diff -ruN -x .svn -x build lucene-clean-trunk/lucene/build.xml lucene-trunk/lucene/build.xml --- lucene-clean-trunk/lucene/build.xml 2011-05-22 12:38:26.000000000 -0400 +++ lucene-trunk/lucene/build.xml 2011-05-22 18:53:18.000000000 -0400 @@ -231,7 +231,6 @@ - @@ -256,7 +255,6 @@ - diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/build.xml lucene-trunk/lucene/contrib/spellchecker/build.xml --- lucene-clean-trunk/lucene/contrib/spellchecker/build.xml 2011-05-22 12:38:17.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/build.xml 1969-12-31 19:00:00.000000000 -0500 @@ -1,43 +0,0 @@ - - - - - - - - Spell Checker - - - - - - - - - - - - - - - - - - - diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,35 +0,0 @@ -package org.apache.lucene.search.spell; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Iterator; - -/** - * A simple interface representing a Dictionary. A Dictionary - * here is just a list of words. - * - * - * @version 1.0 - */ -public interface Dictionary { - - /** - * Return all words present in the dictionary - * @return Iterator - */ - Iterator getWordsIterator(); -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,487 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.Locale; -import java.util.PriorityQueue; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.FuzzyTermsEnum; -import org.apache.lucene.search.BoostAttribute; -import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.LevenshteinAutomata; - -/** - * Simple automaton-based spellchecker. - *

- * Candidates are presented directly from the term dictionary, based on - * Levenshtein distance. This is an alternative to {@link SpellChecker} - * if you are using an edit-distance-like metric such as Levenshtein - * or {@link JaroWinklerDistance}. - *

- * A practical benefit of this spellchecker is that it requires no additional - * datastructures (neither in RAM nor on disk) to do its work. - * - * @see LevenshteinAutomata - * @see FuzzyTermsEnum - * - * @lucene.experimental - */ -public class DirectSpellChecker { - /** The default StringDistance, Levenshtein distance implemented internally - * via {@link LevenshteinAutomata}. - *

- * Note: this is the fastest distance metric, because Levenshtein is used - * to draw candidates from the term dictionary: this just re-uses the scoring. - *

- * Note also that this metric differs in subtle ways from {@link LevensteinDistance}: - *

    - *
  • This metric treats full unicode codepoints as characters, but - * LevenshteinDistance calculates based on UTF-16 code units. - *
  • This metric scales raw edit distances into a floating point score - * differently than LevenshteinDistance: the scaling is based upon the - * shortest of the two terms instead of the longest. - *
- */ - public static final StringDistance INTERNAL_LEVENSHTEIN = new StringDistance() { - public float getDistance(String s1, String s2) { - throw new UnsupportedOperationException("Not for external use."); - }}; - - /** maximum edit distance for candidate terms */ - private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; - /** minimum prefix for candidate terms */ - private int minPrefix = 1; - /** maximum number of top-N inspections per suggestion */ - private int maxInspections = 5; - /** minimum accuracy for a term to match */ - private float accuracy = SpellChecker.DEFAULT_ACCURACY; - /** value in [0..1] (or absolute number >=1) representing the minimum - * number of documents (of the total) where a term should appear. */ - private float thresholdFrequency = 0f; - /** minimum length of a query word to return suggestions */ - private int minQueryLength = 4; - /** value in [0..1] (or absolute number >=1) representing the maximum - * number of documents (of the total) a query term can appear in to - * be corrected. */ - private float maxQueryFrequency = 0.01f; - /** true if the spellchecker should lowercase terms */ - private boolean lowerCaseTerms = true; - /** the comparator to use */ - private Comparator comparator = SuggestWordQueue.DEFAULT_COMPARATOR; - /** the string distance to use */ - private StringDistance distance = INTERNAL_LEVENSHTEIN; - - /** Get the maximum number of Levenshtein edit-distances to draw - * candidate terms from. */ - public int getMaxEdits() { - return maxEdits; - } - - /** Sets the maximum number of Levenshtein edit-distances to draw - * candidate terms from. This value can be 1 or 2. The default is 2. - *

- * Note: a large number of spelling errors occur with an edit distance - * of 1, by setting this value to 1 you can increase both performance - * and precision at the cost of recall. - */ - public void setMaxEdits(int maxEdits) { - if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) - throw new UnsupportedOperationException("Invalid maxEdits"); - this.maxEdits = maxEdits; - } - - /** - * Get the minimal number of characters that must match exactly - */ - public int getMinPrefix() { - return minPrefix; - } - - /** - * Sets the minimal number of initial characters (default: 1) - * that must match exactly. - *

- * This can improve both performance and accuracy of results, - * as misspellings are commonly not the first character. - */ - public void setMinPrefix(int minPrefix) { - this.minPrefix = minPrefix; - } - - /** - * Get the maximum number of top-N inspections per suggestion - */ - public int getMaxInspections() { - return maxInspections; - } - - /** - * Set the maximum number of top-N inspections (default: 5) per suggestion. - *

- * Increasing this number can improve the accuracy of results, at the cost - * of performance. - */ - public void setMaxInspections(int maxInspections) { - this.maxInspections = maxInspections; - } - - /** - * Get the minimal accuracy from the StringDistance for a match - */ - public float getAccuracy() { - return accuracy; - } - - /** - * Set the minimal accuracy required (default: 0.5f) from a StringDistance - * for a suggestion match. - */ - public void setAccuracy(float accuracy) { - this.accuracy = accuracy; - } - - /** - * Get the minimal threshold of documents a term must appear for a match - */ - public float getThresholdFrequency() { - return thresholdFrequency; - } - - /** - * Set the minimal threshold of documents a term must appear for a match. - *

- * This can improve quality by only suggesting high-frequency terms. Note that - * very high values might decrease performance slightly, by forcing the spellchecker - * to draw more candidates from the term dictionary, but a practical value such - * as 1 can be very useful towards improving quality. - *

- * This can be specified as a relative percentage of documents such as 0.5f, - * or it can be specified as an absolute whole document frequency, such as 4f. - * Absolute document frequencies may not be fractional. - */ - public void setThresholdFrequency(float thresholdFrequency) { - if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency) - throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed"); - this.thresholdFrequency = thresholdFrequency; - } - - /** Get the minimum length of a query term needed to return suggestions */ - public int getMinQueryLength() { - return minQueryLength; - } - - /** - * Set the minimum length of a query term (default: 4) needed to return suggestions. - *

- * Very short query terms will often cause only bad suggestions with any distance - * metric. - */ - public void setMinQueryLength(int minQueryLength) { - this.minQueryLength = minQueryLength; - } - - /** - * Get the maximum threshold of documents a query term can appear in order - * to provide suggestions. - */ - public float getMaxQueryFrequency() { - return maxQueryFrequency; - } - - /** - * Set the maximum threshold (default: 0.01f) of documents a query term can - * appear in order to provide suggestions. - *

- * Very high-frequency terms are typically spelled correctly. Additionally, - * this can increase performance as it will do no work for the common case - * of correctly-spelled input terms. - *

- * This can be specified as a relative percentage of documents such as 0.5f, - * or it can be specified as an absolute whole document frequency, such as 4f. - * Absolute document frequencies may not be fractional. - */ - public void setMaxQueryFrequency(float maxQueryFrequency) { - if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency) - throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed"); - this.maxQueryFrequency = maxQueryFrequency; - } - - /** true if the spellchecker should lowercase terms */ - public boolean getLowerCaseTerms() { - return lowerCaseTerms; - } - - /** - * True if the spellchecker should lowercase terms (default: true) - *

- * This is a convenience method, if your index field has more complicated - * analysis (such as StandardTokenizer removing punctuation), its probably - * better to turn this off, and instead run your query terms through your - * Analyzer first. - *

- * If this option is not on, case differences count as an edit! - */ - public void setLowerCaseTerms(boolean lowerCaseTerms) { - this.lowerCaseTerms = lowerCaseTerms; - } - - /** - * Get the current comparator in use. - */ - public Comparator getComparator() { - return comparator; - } - - /** - * Set the comparator for sorting suggestions. - * The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR} - */ - public void setComparator(Comparator comparator) { - this.comparator = comparator; - } - - /** - * Get the string distance metric in use. - */ - public StringDistance getDistance() { - return distance; - } - - /** - * Set the string distance metric. - * The default is {@link #INTERNAL_LEVENSHTEIN} - *

- * Note: because this spellchecker draws its candidates from the - * term dictionary using Levenshtein, it works best with an edit-distance-like - * string metric. If you use a different metric than the default, - * you might want to consider increasing {@link #setMaxInspections(int)} - * to draw more candidates for your metric to rank. - */ - public void setDistance(StringDistance distance) { - this.distance = distance; - } - - /** - * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean) - * suggestSimilar(term, numSug, ir, false)} - */ - public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir) - throws IOException { - return suggestSimilar(term, numSug, ir, false); - } - - /** - * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean, float) - * suggestSimilar(term, numSug, ir, morePopular, this.accuracy)} - */ - public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, - boolean morePopular) throws IOException { - return suggestSimilar(term, numSug, ir, morePopular, accuracy); - } - - /** - * Suggest similar words. - * - *

Unlike {@link SpellChecker}, the similarity used to fetch the most - * relevant terms is an edit distance, therefore typically a low value - * for numSug will work very well. - * - * @param term Term you want to spell check on - * @param numSug the maximum number of suggested words - * @param ir IndexReader to find terms from - * @param morePopular return only suggested words that are as frequent or more frequent than the searched word - * @param accuracy return only suggested words that match with this similarity - * @return sorted list of the suggested words according to the comparator - * @throws IOException - */ - public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, - boolean morePopular, float accuracy) throws IOException { - - String text = term.text(); - if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength) - return new SuggestWord[0]; - - if (lowerCaseTerms) - term = term.createTerm(text.toLowerCase(Locale.ENGLISH)); - - int docfreq = ir.docFreq(term); - - // see line 341 of spellchecker. this is certainly very very nice for perf, - // but is it really the right way to go? 
- if (!morePopular && docfreq > 0) { - return new SuggestWord[0]; - } - - int maxDoc = ir.maxDoc(); - - if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) { - return new SuggestWord[0]; - } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) { - return new SuggestWord[0]; - } - - if (!morePopular) docfreq = 0; - - if (thresholdFrequency >= 1f) { - docfreq = Math.max(docfreq, (int) thresholdFrequency); - } else if (thresholdFrequency > 0f) { - docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1); - } - - Collection terms = null; - int inspections = numSug * maxInspections; - - // try ed=1 first, in case we get lucky - terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy); - if (maxEdits > 1 && terms.size() < inspections) { - HashSet moreTerms = new HashSet(); - moreTerms.addAll(terms); - moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy)); - terms = moreTerms; - } - - // create the suggestword response, sort it, and trim it to size. - - SuggestWord suggestions[] = new SuggestWord[terms.size()]; - int index = suggestions.length - 1; - for (ScoreTerm s : terms) { - SuggestWord suggestion = new SuggestWord(); - suggestion.string = s.termAsString != null ? 
s.termAsString : s.term.utf8ToString(); - suggestion.score = s.score; - suggestion.freq = s.docfreq; - suggestions[index--] = suggestion; - } - - ArrayUtil.mergeSort(suggestions, Collections.reverseOrder(comparator)); - if (numSug < suggestions.length) { - SuggestWord trimmed[] = new SuggestWord[numSug]; - System.arraycopy(suggestions, 0, trimmed, 0, numSug); - suggestions = trimmed; - } - return suggestions; - } - - private Collection suggestSimilar(Term term, int numSug, - IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException { - - AttributeSource atts = new AttributeSource(); - MaxNonCompetitiveBoostAttribute maxBoostAtt = - atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); - FuzzyTermsEnum e = new FuzzyTermsEnum(MultiFields.getTerms(ir, term.field()).iterator(), atts, term, editDistance, Math.max(minPrefix, editDistance-1)); - final PriorityQueue stQueue = new PriorityQueue(); - - BytesRef queryTerm = new BytesRef(term.text()); - BytesRef candidateTerm; - ScoreTerm st = new ScoreTerm(); - BoostAttribute boostAtt = - e.attributes().addAttribute(BoostAttribute.class); - while ((candidateTerm = e.next()) != null) { - final float boost = boostAtt.getBoost(); - // ignore uncompetitive hits - if (stQueue.size() >= numSug && boost <= stQueue.peek().boost) - continue; - - // ignore exact match of the same term - if (queryTerm.bytesEquals(candidateTerm)) - continue; - - int df = e.docFreq(); - - // check docFreq if required - if (df <= docfreq) - continue; - - final float score; - final String termAsString; - if (distance == INTERNAL_LEVENSHTEIN) { - // delay creating strings until the end - termAsString = null; - // undo FuzzyTermsEnum's scale factor for a real scaled lev score - score = boost / e.getScaleFactor() + e.getMinSimilarity(); - } else { - termAsString = candidateTerm.utf8ToString(); - score = distance.getDistance(term.text(), termAsString); - } - - if (score < accuracy) - continue; - - // add new entry in PQ - st.term = 
new BytesRef(candidateTerm); - st.boost = boost; - st.docfreq = df; - st.termAsString = termAsString; - st.score = score; - stQueue.offer(st); - // possibly drop entries from queue - st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); - maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); - } - - return stQueue; - } - - private static class ScoreTerm implements Comparable { - public BytesRef term; - public float boost; - public int docfreq; - - public String termAsString; - public float score; - - public int compareTo(ScoreTerm other) { - if (term.bytesEquals(other.term)) - return 0; // consistent with equals - if (this.boost == other.boost) - return other.term.compareTo(this.term); - else - return Float.compare(this.boost, other.boost); - } - - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + ((term == null) ? 0 : term.hashCode()); - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; - ScoreTerm other = (ScoreTerm) obj; - if (term == null) { - if (other.term != null) return false; - } else if (!term.bytesEquals(other.term)) return false; - return true; - } - } -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,112 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software 
Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Arrays; - -public class JaroWinklerDistance implements StringDistance { - - private float threshold = 0.7f; - - private int[] matches(String s1, String s2) { - String max, min; - if (s1.length() > s2.length()) { - max = s1; - min = s2; - } else { - max = s2; - min = s1; - } - int range = Math.max(max.length() / 2 - 1, 0); - int[] matchIndexes = new int[min.length()]; - Arrays.fill(matchIndexes, -1); - boolean[] matchFlags = new boolean[max.length()]; - int matches = 0; - for (int mi = 0; mi < min.length(); mi++) { - char c1 = min.charAt(mi); - for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max - .length()); xi < xn; xi++) { - if (!matchFlags[xi] && c1 == max.charAt(xi)) { - matchIndexes[mi] = xi; - matchFlags[xi] = true; - matches++; - break; - } - } - } - char[] ms1 = new char[matches]; - char[] ms2 = new char[matches]; - for (int i = 0, si = 0; i < min.length(); i++) { - if (matchIndexes[i] != -1) { - ms1[si] = min.charAt(i); - si++; - } - } - for (int i = 0, si = 0; i < max.length(); i++) { - if (matchFlags[i]) { - ms2[si] = max.charAt(i); - si++; - } - } - int transpositions = 0; - for (int mi = 0; mi < ms1.length; mi++) { - if (ms1[mi] != ms2[mi]) { - 
transpositions++; - } - } - int prefix = 0; - for (int mi = 0; mi < min.length(); mi++) { - if (s1.charAt(mi) == s2.charAt(mi)) { - prefix++; - } else { - break; - } - } - return new int[] { matches, transpositions / 2, prefix, max.length() }; - } - - public float getDistance(String s1, String s2) { - int[] mtp = matches(s1, s2); - float m = mtp[0]; - if (m == 0) { - return 0f; - } - float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3; - float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2] - * (1 - j); - return jw; - } - - /** - * Sets the threshold used to determine when Winkler bonus should be used. - * Set to a negative value to get the Jaro distance. - * @param threshold the new value of the threshold - */ - public void setThreshold(float threshold) { - this.threshold = threshold; - } - - /** - * Returns the current value of the threshold used for adding the Winkler bonus. - * The default value is 0.7. - * @return the current value of the threshold - */ - public float getThreshold() { - return threshold; - } -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,109 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Levenstein edit distance class. - */ -public final class LevensteinDistance implements StringDistance { - - /** - * Optimized to run a bit faster than the static getDistance(). - * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster. - */ - public LevensteinDistance () { - } - - - //***************************** - // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String) - //***************************** - public float getDistance (String target, String other) { - char[] sa; - int n; - int p[]; //'previous' cost array, horizontally - int d[]; // cost array, horizontally - int _d[]; //placeholder to assist in swapping p and d - - /* - The difference between this impl. and the previous is that, rather - than creating and retaining a matrix of size s.length()+1 by t.length()+1, - we maintain two single-dimensional arrays of length s.length()+1. The first, d, - is the 'current working' distance array that maintains the newest distance cost - counts as we iterate through the characters of String s. Each time we increment - the index of String t we are comparing, d is copied to p, the second int[]. 
Doing so - allows us to retain the previous cost counts as required by the algorithm (taking - the minimum of the cost count to the left, up one, and diagonally up and to the left - of the current cost count being calculated). (Note that the arrays aren't really - copied anymore, just switched...this is clearly much better than cloning an array - or doing a System.arraycopy() each time through the outer loop.) - - Effectively, the difference between the two implementations is this one does not - cause an out of memory condition when calculating the LD over two very large strings. - */ - - sa = target.toCharArray(); - n = sa.length; - p = new int[n+1]; - d = new int[n+1]; - - final int m = other.length(); - if (n == 0 || m == 0) { - if (n == m) { - return 1; - } - else { - return 0; - } - } - - - // indexes into strings s and t - int i; // iterates through s - int j; // iterates through t - - char t_j; // jth character of t - - int cost; // cost - - for (i = 0; i<=n; i++) { - p[i] = i; - } - - for (j = 1; j<=m; j++) { - t_j = other.charAt(j-1); - d[0] = j; - - for (i=1; i<=n; i++) { - cost = sa[i-1]==t_j ? 
0 : 1; - // minimum of cell to the left+1, to the top+1, diagonally left and up +cost - d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); - } - - // copy current distance counts to 'previous row' distance counts - _d = p; - p = d; - d = _d; - } - - // our last action in the above loop was to switch d and p, so p now - // actually has the most recent cost counts - return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length)); - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,96 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.index.IndexReader; - -import java.util.Iterator; - -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.util.StringHelper; - -import java.io.*; - -/** - * Lucene Dictionary: terms taken from the given field - * of a Lucene index. - * - * When using IndexReader.terms(Term) the code must not call next() on TermEnum - * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6 - * - * - * - */ -public class LuceneDictionary implements Dictionary { - private IndexReader reader; - private String field; - - public LuceneDictionary(IndexReader reader, String field) { - this.reader = reader; - this.field = StringHelper.intern(field); - } - - public final Iterator getWordsIterator() { - return new LuceneIterator(); - } - - - final class LuceneIterator implements Iterator { - private TermsEnum termsEnum; - private BytesRef pendingTerm; - - LuceneIterator() { - try { - final Terms terms = MultiFields.getTerms(reader, field); - if (terms != null) { - termsEnum = terms.iterator(); - pendingTerm = termsEnum.next(); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - public String next() { - if (pendingTerm == null) { - return null; - } - - String result = pendingTerm.utf8ToString(); - - try { - pendingTerm = termsEnum.next(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - return result; - } - - public boolean hasNext() { - return pendingTerm != null; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java --- 
lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,144 +0,0 @@ -package org.apache.lucene.search.spell; - -/** -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -/** - * N-Gram version of edit distance based on paper by Grzegorz Kondrak, - * "N-gram similarity and distance". Proceedings of the Twelfth International - * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126, - * Buenos Aires, Argentina, November 2005. - * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf - * - * This implementation uses the position-based optimization to compute partial - * matches of n-gram sub-strings and adds a null-character prefix of size n-1 - * so that the first character is contained in the same number of n-grams as - * a middle character. Null-character prefix matches are discounted so that - * strings with no matching characters will return a distance of 0. 
- * - */ -public class NGramDistance implements StringDistance { - - private int n; - - /** - * Creates an N-Gram distance measure using n-grams of the specified size. - * @param size The size of the n-gram to be used to compute the string distance. - */ - public NGramDistance(int size) { - this.n = size; - } - - /** - * Creates an N-Gram distance measure using n-grams of size 2. - */ - public NGramDistance() { - this(2); - } - - public float getDistance(String source, String target) { - final int sl = source.length(); - final int tl = target.length(); - - if (sl == 0 || tl == 0) { - if (sl == tl) { - return 1; - } - else { - return 0; - } - } - - int cost = 0; - if (sl < n || tl < n) { - for (int i=0,ni=Math.min(sl,tl);iFormat allowed: 1 word per line:
- * word1
- * word2
- * word3
- */ -public class PlainTextDictionary implements Dictionary { - - private BufferedReader in; - private String line; - private boolean hasNextCalled; - - public PlainTextDictionary(File file) throws FileNotFoundException { - in = new BufferedReader(new FileReader(file)); - } - - public PlainTextDictionary(InputStream dictFile) { - in = new BufferedReader(new InputStreamReader(dictFile)); - } - - /** - * Creates a dictionary based on a reader. - */ - public PlainTextDictionary(Reader reader) { - in = new BufferedReader(reader); - } - - public Iterator getWordsIterator() { - return new fileIterator(); - } - - final class fileIterator implements Iterator { - public String next() { - if (!hasNextCalled) { - hasNext(); - } - hasNextCalled = false; - return line; - } - - public boolean hasNext() { - hasNextCalled = true; - try { - line = in.readLine(); - } catch (IOException ex) { - throw new RuntimeException(ex); - } - return (line != null) ? true : false; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,724 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; - -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.TieredMergePolicy; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.ReaderUtil; -import org.apache.lucene.util.Version; - -/** - *

- * Spell Checker class (Main class)
- * (initially inspired by the David Spencer code). - *

- * - *

Example Usage: - * - *

- *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
- *  // To index a field of a user index:
- *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
- *  // To index a file containing words:
- *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
- *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
- * 
- * - * - * @version 1.0 - */ -public class SpellChecker implements java.io.Closeable { - - /** - * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} . - */ - public static final float DEFAULT_ACCURACY = 0.5f; - - /** - * Field name for each word in the ngram index. - */ - public static final String F_WORD = "word"; - - private static final Term F_WORD_TERM = new Term(F_WORD); - - /** - * the spell index - */ - // don't modify the directory directly - see #swapSearcher() - // TODO: why is this package private? - Directory spellIndex; - /** - * Boost value for start and end grams - */ - private float bStart = 2.0f; - - private float bEnd = 1.0f; - // don't use this searcher directly - see #swapSearcher() - - private IndexSearcher searcher; - /* - * this locks all modifications to the current searcher. - */ - - private final Object searcherLock = new Object(); - /* - * this lock synchronizes all possible modifications to the - * current index directory. It should not be possible to try modifying - * the same index concurrently. Note: Do not acquire the searcher lock - * before acquiring this lock! - */ - private final Object modifyCurrentIndexLock = new Object(); - - private volatile boolean closed = false; - // minimum score for hits generated by the spell checker query - - private float accuracy = DEFAULT_ACCURACY; - - private StringDistance sd; - private Comparator comparator; - - /** - * Use the given directory as a spell checker index. The directory - * is created if it doesn't exist yet. 
- * @param spellIndex the spell index directory - * @param sd the {@link StringDistance} measurement to use - * @throws IOException if Spellchecker can not open the directory - */ - public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException { - this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR); - } - /** - * Use the given directory as a spell checker index with a - * {@link LevensteinDistance} as the default {@link StringDistance}. The - * directory is created if it doesn't exist yet. - * - * @param spellIndex - * the spell index directory - * @throws IOException - * if spellchecker can not open the directory - */ - public SpellChecker(Directory spellIndex) throws IOException { - this(spellIndex, new LevensteinDistance()); - } - - /** - * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure - * and the given {@link java.util.Comparator} for sorting the results. - * @param spellIndex The spelling index - * @param sd The distance - * @param comparator The comparator - * @throws IOException if there is a problem opening the index - */ - public SpellChecker(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException { - setSpellIndex(spellIndex); - setStringDistance(sd); - this.comparator = comparator; - } - - /** - * Use a different index as the spell checker index or re-open - * the existing index if spellIndex is the same value - * as given in the constructor. 
- * @param spellIndexDir the spell directory to use - * @throws AlreadyClosedException if the Spellchecker is already closed - * @throws IOException if spellchecker can not open the directory - */ - // TODO: we should make this final as it is called in the constructor - public void setSpellIndex(Directory spellIndexDir) throws IOException { - // this could be the same directory as the current spellIndex - // modifications to the directory should be synchronized - synchronized (modifyCurrentIndexLock) { - ensureOpen(); - if (!IndexReader.indexExists(spellIndexDir)) { - IndexWriter writer = new IndexWriter(spellIndexDir, - new IndexWriterConfig(Version.LUCENE_CURRENT, - new WhitespaceAnalyzer(Version.LUCENE_CURRENT))); - writer.close(); - } - swapSearcher(spellIndexDir); - } - } - - /** - * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}. - * @param comparator the comparator - */ - public void setComparator(Comparator comparator) { - this.comparator = comparator; - } - - public Comparator getComparator() { - return comparator; - } - - /** - * Sets the {@link StringDistance} implementation for this - * {@link SpellChecker} instance. - * - * @param sd the {@link StringDistance} implementation for this - * {@link SpellChecker} instance - */ - public void setStringDistance(StringDistance sd) { - this.sd = sd; - } - /** - * Returns the {@link StringDistance} instance used by this - * {@link SpellChecker} instance. - * - * @return the {@link StringDistance} instance used by this - * {@link SpellChecker} instance. 
- */ - public StringDistance getStringDistance() { - return sd; - } - - /** - * Sets the accuracy 0 < minScore < 1; default {@link #DEFAULT_ACCURACY} - * @param acc The new accuracy - */ - public void setAccuracy(float acc) { - this.accuracy = acc; - } - - /** - * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)}, to - * decide whether a suggestion is included or not. - * @return The current accuracy setting - */ - public float getAccuracy() { - return accuracy; - } - - /** - * Suggest similar words. - * - *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms - * is not the same as the edit distance strategy used to calculate the best - * matching spell-checked word from the hits that Lucene found, one usually has - * to retrieve a couple of numSug's in order to get the true best match. - * - *

I.e. if numSug == 1, don't count on that suggestion being the best one. - * Thus, you should set this value to at least 5 for a good suggestion. - * - * @param word the word you want a spell check done on - * @param numSug the number of suggested words - * @throws IOException if the underlying index throws an {@link IOException} - * @throws AlreadyClosedException if the Spellchecker is already closed - * @return String[] - * - * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) - */ - public String[] suggestSimilar(String word, int numSug) throws IOException { - return this.suggestSimilar(word, numSug, null, null, false); - } - - /** - * Suggest similar words. - * - *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms - * is not the same as the edit distance strategy used to calculate the best - * matching spell-checked word from the hits that Lucene found, one usually has - * to retrieve a couple of numSug's in order to get the true best match. - * - *

I.e. if numSug == 1, don't count on that suggestion being the best one. - * Thus, you should set this value to at least 5 for a good suggestion. - * - * @param word the word you want a spell check done on - * @param numSug the number of suggested words - * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results - * @throws IOException if the underlying index throws an {@link IOException} - * @throws AlreadyClosedException if the Spellchecker is already closed - * @return String[] - * - * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) - */ - public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException { - return this.suggestSimilar(word, numSug, null, null, false, accuracy); - } - - /** - * Suggest similar words (optionally restricted to a field of an index). - * - *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms - * is not the same as the edit distance strategy used to calculate the best - * matching spell-checked word from the hits that Lucene found, one usually has - * to retrieve a couple of numSug's in order to get the true best match. - * - *

I.e. if numSug == 1, don't count on that suggestion being the best one. - * Thus, you should set this value to at least 5 for a good suggestion. - * - *

Uses the {@link #getAccuracy()} value passed into the constructor as the accuracy. - * - * @param word the word you want a spell check done on - * @param numSug the number of suggested words - * @param ir the indexReader of the user index (can be null see field param) - * @param field the field of the user index: if field is not null, the suggested - * words are restricted to the words present in this field. - * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word - * (only if restricted mode = (indexReader!=null and field!=null) - * @throws IOException if the underlying index throws an {@link IOException} - * @throws AlreadyClosedException if the Spellchecker is already closed - * @return String[] the sorted list of the suggest words with these 2 criteria: - * first criteria: the edit distance, second criteria (only if restricted mode): the popularity - * of the suggest words in the field of the user index - * - * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) - */ - public String[] suggestSimilar(String word, int numSug, IndexReader ir, - String field, boolean morePopular) throws IOException { - return suggestSimilar(word, numSug, ir, field, morePopular, accuracy); - } - - - /** - * Suggest similar words (optionally restricted to a field of an index). - * - *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms - * is not the same as the edit distance strategy used to calculate the best - * matching spell-checked word from the hits that Lucene found, one usually has - * to retrieve a couple of numSug's in order to get the true best match. - * - *

I.e. if numSug == 1, don't count on that suggestion being the best one. - * Thus, you should set this value to at least 5 for a good suggestion. - * - * @param word the word you want a spell check done on - * @param numSug the number of suggested words - * @param ir the indexReader of the user index (can be null see field param) - * @param field the field of the user index: if field is not null, the suggested - * words are restricted to the words present in this field. - * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word - * (only if restricted mode = (indexReader!=null and field!=null) - * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results - * @throws IOException if the underlying index throws an {@link IOException} - * @throws AlreadyClosedException if the Spellchecker is already closed - * @return String[] the sorted list of the suggest words with these 2 criteria: - * first criteria: the edit distance, second criteria (only if restricted mode): the popularity - * of the suggest words in the field of the user index - */ - public String[] suggestSimilar(String word, int numSug, IndexReader ir, - String field, boolean morePopular, float accuracy) throws IOException { - // obtainSearcher calls ensureOpen - final IndexSearcher indexSearcher = obtainSearcher(); - try{ - - final int lengthWord = word.length(); - - final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0; - final int goalFreq = (morePopular && ir != null && field != null) ? 
freq : 0; - // if the word exists in the real index and we don't care for word frequency, return the word itself - if (!morePopular && freq > 0) { - return new String[] { word }; - } - - BooleanQuery query = new BooleanQuery(); - String[] grams; - String key; - - for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) { - - key = "gram" + ng; // form key - - grams = formGrams(word, ng); // form word into ngrams (allow dups too) - - if (grams.length == 0) { - continue; // hmm - } - - if (bStart > 0) { // should we boost prefixes? - add(query, "start" + ng, grams[0], bStart); // matches start of word - - } - if (bEnd > 0) { // should we boost suffixes - add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word - - } - for (int i = 0; i < grams.length; i++) { - add(query, key, grams[i]); - } - } - - int maxHits = 10 * numSug; - - // System.out.println("Q: " + query); - ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs; - // System.out.println("HITS: " + hits.length()); - SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator); - - // go thru more than 'maxr' matches in case the distance filter triggers - int stop = Math.min(hits.length, maxHits); - SuggestWord sugWord = new SuggestWord(); - for (int i = 0; i < stop; i++) { - - sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word - - // don't suggest a word for itself, that would be silly - if (sugWord.string.equals(word)) { - continue; - } - - // edit distance - sugWord.score = sd.getDistance(word,sugWord.string); - if (sugWord.score < accuracy) { - continue; - } - - if (ir != null && field != null) { // use the user index - sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index - // don't suggest a word that is not present in the field - if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) { - continue; - } - } - sugQueue.insertWithOverflow(sugWord); - if (sugQueue.size() == numSug) { - // 
if queue full, maintain the minScore score - accuracy = sugQueue.top().score; - } - sugWord = new SuggestWord(); - } - - // convert to array string - String[] list = new String[sugQueue.size()]; - for (int i = sugQueue.size() - 1; i >= 0; i--) { - list[i] = sugQueue.pop().string; - } - - return list; - } finally { - releaseSearcher(indexSearcher); - } - } - /** - * Add a clause to a boolean query. - */ - private static void add(BooleanQuery q, String name, String value, float boost) { - Query tq = new TermQuery(new Term(name, value)); - tq.setBoost(boost); - q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD)); - } - - /** - * Add a clause to a boolean query. - */ - private static void add(BooleanQuery q, String name, String value) { - q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD)); - } - - /** - * Form all ngrams for a given word. - * @param text the word to parse - * @param ng the ngram length e.g. 3 - * @return an array of all ngrams in the word and note that duplicates are not removed - */ - private static String[] formGrams(String text, int ng) { - int len = text.length(); - String[] res = new String[len - ng + 1]; - for (int i = 0; i < len - ng + 1; i++) { - res[i] = text.substring(i, i + ng); - } - return res; - } - - /** - * Removes all terms from the spell check index. - * @throws IOException - * @throws AlreadyClosedException if the Spellchecker is already closed - */ - public void clearIndex() throws IOException { - synchronized (modifyCurrentIndexLock) { - ensureOpen(); - final Directory dir = this.spellIndex; - final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( - Version.LUCENE_CURRENT, - new WhitespaceAnalyzer(Version.LUCENE_CURRENT)) - .setOpenMode(OpenMode.CREATE)); - writer.close(); - swapSearcher(dir); - } - } - - /** - * Check whether the word exists in the index. 
- * @param word - * @throws IOException - * @throws AlreadyClosedException if the Spellchecker is already closed - * @return true if the word exists in the index - */ - public boolean exist(String word) throws IOException { - // obtainSearcher calls ensureOpen - final IndexSearcher indexSearcher = obtainSearcher(); - try{ - return indexSearcher.docFreq(F_WORD_TERM.createTerm(word)) > 0; - } finally { - releaseSearcher(indexSearcher); - } - } - - /** - * Indexes the data from the given {@link Dictionary}. - * @param dict Dictionary to index - * @param mergeFactor mergeFactor to use when indexing - * @param ramMB the max amount or memory in MB to use - * @param optimize whether or not the spellcheck index should be optimized - * @throws AlreadyClosedException if the Spellchecker is already closed - * @throws IOException - */ - public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException { - synchronized (modifyCurrentIndexLock) { - ensureOpen(); - final Directory dir = this.spellIndex; - final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB)); - ((TieredMergePolicy) writer.getConfig().getMergePolicy()).setMaxMergeAtOnce(mergeFactor); - IndexSearcher indexSearcher = obtainSearcher(); - final List termsEnums = new ArrayList(); - - if (searcher.maxDoc() > 0) { - new ReaderUtil.Gather(searcher.getIndexReader()) { - @Override - protected void add(int base, IndexReader r) throws IOException { - Terms terms = r.terms(F_WORD); - if (terms != null) - termsEnums.add(terms.iterator()); - } - }.run(); - } - - boolean isEmpty = termsEnums.isEmpty(); - - try { - Iterator iter = dict.getWordsIterator(); - BytesRef currentTerm = new BytesRef(); - - terms: while (iter.hasNext()) { - String word = iter.next(); - - int len = word.length(); - if (len < 3) { - continue; // too short we bail but "too long" is fine... 
- } - - if (!isEmpty) { - // we have a non-empty index, check if the term exists - currentTerm.copy(word); - for (TermsEnum te : termsEnums) { - if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) { - continue terms; - } - } - } - - // ok index the word - Document doc = createDocument(word, getMin(len), getMax(len)); - writer.addDocument(doc); - } - } finally { - releaseSearcher(indexSearcher); - } - // close writer - if (optimize) - writer.optimize(); - writer.close(); - // also re-open the spell index to see our own changes when the next suggestion - // is fetched: - swapSearcher(dir); - } - } - - /** - * Indexes the data from the given {@link Dictionary}. - * @param dict the dictionary to index - * @param mergeFactor mergeFactor to use when indexing - * @param ramMB the max amount or memory in MB to use - * @throws IOException - */ - public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException { - indexDictionary(dict, mergeFactor, ramMB, true); - } - - /** - * Indexes the data from the given {@link Dictionary}. - * @param dict the dictionary to index - * @throws IOException - */ - public final void indexDictionary(Dictionary dict) throws IOException { - indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB); - } - - private static int getMin(int l) { - if (l > 5) { - return 3; - } - if (l == 5) { - return 2; - } - return 1; - } - - private static int getMax(int l) { - if (l > 5) { - return 4; - } - if (l == 5) { - return 3; - } - return 2; - } - - private static Document createDocument(String text, int ng1, int ng2) { - Document doc = new Document(); - // the word field is never queried on... its indexed so it can be quickly - // checked for rebuild (and stored for retrieval). 
Doesn't need norms or TF/pos - Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED); - f.setOmitTermFreqAndPositions(true); - f.setOmitNorms(true); - doc.add(f); // orig term - addGram(text, doc, ng1, ng2); - return doc; - } - - private static void addGram(String text, Document doc, int ng1, int ng2) { - int len = text.length(); - for (int ng = ng1; ng <= ng2; ng++) { - String key = "gram" + ng; - String end = null; - for (int i = 0; i < len - ng + 1; i++) { - String gram = text.substring(i, i + ng); - doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED)); - if (i == 0) { - // only one term possible in the startXXField, TF/pos and norms aren't needed. - Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED); - startField.setOmitTermFreqAndPositions(true); - startField.setOmitNorms(true); - doc.add(startField); - } - end = gram; - } - if (end != null) { // may not be present if len==ng1 - // only one term possible in the endXXField, TF/pos and norms aren't needed. 
- Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED); - endField.setOmitTermFreqAndPositions(true); - endField.setOmitNorms(true); - doc.add(endField); - } - } - } - - private IndexSearcher obtainSearcher() { - synchronized (searcherLock) { - ensureOpen(); - searcher.getIndexReader().incRef(); - return searcher; - } - } - - private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{ - // don't check if open - always decRef - // don't decrement the private searcher - could have been swapped - aSearcher.getIndexReader().decRef(); - } - - private void ensureOpen() { - if (closed) { - throw new AlreadyClosedException("Spellchecker has been closed"); - } - } - - /** - * Close the IndexSearcher used by this SpellChecker - * @throws IOException if the close operation causes an {@link IOException} - * @throws AlreadyClosedException if the {@link SpellChecker} is already closed - */ - public void close() throws IOException { - synchronized (searcherLock) { - ensureOpen(); - closed = true; - if (searcher != null) { - searcher.close(); - } - searcher = null; - } - } - - private void swapSearcher(final Directory dir) throws IOException { - /* - * opening a searcher is possibly very expensive. - * We rather close it again if the Spellchecker was closed during - * this operation than block access to the current searcher while opening. - */ - final IndexSearcher indexSearcher = createSearcher(dir); - synchronized (searcherLock) { - if(closed){ - indexSearcher.close(); - throw new AlreadyClosedException("Spellchecker has been closed"); - } - if (searcher != null) { - searcher.close(); - } - // set the spellindex in the sync block - ensure consistency. 
- searcher = indexSearcher; - this.spellIndex = dir; - } - } - - /** - * Creates a new read-only IndexSearcher - * @param dir the directory used to open the searcher - * @return a new read-only IndexSearcher - * @throws IOException f there is a low-level IO error - */ - // for testing purposes - IndexSearcher createSearcher(final Directory dir) throws IOException{ - return new IndexSearcher(dir, true); - } - - /** - * Returns true if and only if the {@link SpellChecker} is - * closed, otherwise false. - * - * @return true if and only if the {@link SpellChecker} is - * closed, otherwise false. - */ - boolean isClosed(){ - return closed; - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,35 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Interface for string distances. - */ -public interface StringDistance { - - /** - * Returns a float between 0 and 1 based on how similar the specified strings are to one another. - * Returning a value of 1 means the specified strings are identical and 0 means the - * string are maximally different. - * @param s1 The first string. - * @param s2 The second string. - * @return a float between 0 and 1 based on how similar the specified strings are to one another. - */ - public float getDistance(String s1,String s2); - -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,45 +0,0 @@ -package org.apache.lucene.search.spell; - - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * SuggestWord, used in suggestSimilar method in SpellChecker class. - *

- * Default sort is first by score, then by frequency. - * - * - */ -public final class SuggestWord{ - - /** - * the score of the word - */ - public float score; - - /** - * The freq of the word - */ - public int freq; - - /** - * the suggested word - */ - public String string; - -} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,47 +0,0 @@ -package org.apache.lucene.search.spell; - -import java.util.Comparator; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/** - * Frequency first, then score. 
Must have - * - **/ -public class SuggestWordFrequencyComparator implements Comparator { - - public int compare(SuggestWord first, SuggestWord second) { - // first criteria: the frequency - if (first.freq > second.freq) { - return 1; - } - if (first.freq < second.freq) { - return -1; - } - - // second criteria (if first criteria is equal): the score - if (first.score > second.score) { - return 1; - } - if (first.score < second.score) { - return -1; - } - // third criteria: term text - return second.string.compareTo(first.string); - } -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,63 +0,0 @@ -package org.apache.lucene.search.spell; - - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.util.PriorityQueue; - -import java.util.Comparator; - - -/** - * Sorts SuggestWord instances - * - * @see org.apache.lucene.search.spell.SuggestWordScoreComparator - * @see org.apache.lucene.search.spell.SuggestWordFrequencyComparator - * - */ -public final class SuggestWordQueue extends PriorityQueue { - public static final Comparator DEFAULT_COMPARATOR = new SuggestWordScoreComparator(); - - - private Comparator comparator; - - /** - * Use the {@link #DEFAULT_COMPARATOR} - * @param size The size of the queue - */ - public SuggestWordQueue (int size) { - super(size); - comparator = DEFAULT_COMPARATOR; - } - - /** - * Specify the size of the queue and the comparator to use for sorting. - * @param size The size - * @param comparator The comparator. - */ - public SuggestWordQueue(int size, Comparator comparator){ - super(size); - this.comparator = comparator; - } - - @Override - protected final boolean lessThan (SuggestWord wa, SuggestWord wb) { - int val = comparator.compare(wa, wb); - return val < 0; - } -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,47 +0,0 @@ -package org.apache.lucene.search.spell; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Comparator; - - -/** - * Score first, then frequency - * - **/ -public class SuggestWordScoreComparator implements Comparator { - public int compare(SuggestWord first, SuggestWord second) { - // first criteria: the distance - if (first.score > second.score) { - return 1; - } - if (first.score < second.score) { - return -1; - } - - // second criteria (if first criteria is equal): the popularity - if (first.freq > second.freq) { - return 1; - } - - if (first.freq < second.freq) { - return -1; - } - // third criteria: term text - return second.string.compareTo(first.string); - } -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,144 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.Term; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.English; -import org.apache.lucene.util.LuceneTestCase; - -public class TestDirectSpellChecker extends LuceneTestCase { - - public void testSimpleExamples() throws Exception { - DirectSpellChecker spellChecker = new DirectSpellChecker(); - spellChecker.setMinQueryLength(0); - Directory dir = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new MockAnalyzer(random, MockTokenizer.SIMPLE, true)); - - for (int i = 0; i < 20; i++) { - Document doc = new Document(); - doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED)); - writer.addDocument(doc); - } - - IndexReader ir = writer.getReader(); - - SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false); - assertTrue(similar.length > 0); - assertEquals("five", similar[0].string); - - similar = spellChecker.suggestSimilar(new 
Term("numbers", "five"), 2, ir, false); - if (similar.length > 0) { - assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself - } - - similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false); - assertTrue(similar.length > 0); - assertEquals("five", similar[0].string); - - similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false); - assertTrue(similar.length > 0); - assertEquals("five", similar[0].string); - - similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false); - assertTrue(similar.length > 0); - assertEquals("five", similar[0].string); - - assertTrue(similar.length > 0); - similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false); - assertEquals("five", similar[0].string); - - // add some more documents - for (int i = 1000; i < 1100; i++) { - Document doc = new Document(); - doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED)); - writer.addDocument(doc); - } - - ir.close(); - ir = writer.getReader(); - - // look ma, no spellcheck index rebuild - similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false); - assertTrue(similar.length > 0); - assertEquals("thousand", similar[0].string); - - ir.close(); - writer.close(); - dir.close(); - } - - public void testOptions() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new MockAnalyzer(random, MockTokenizer.SIMPLE, true)); - - Document doc = new Document(); - doc.add(newField("text", "foobar", Field.Store.NO, Field.Index.ANALYZED)); - writer.addDocument(doc); - doc.add(newField("text", "foobar", Field.Store.NO, Field.Index.ANALYZED)); - writer.addDocument(doc); - doc.add(newField("text", "foobaz", Field.Store.NO, Field.Index.ANALYZED)); - writer.addDocument(doc); - doc.add(newField("text", "fobar", Field.Store.NO, Field.Index.ANALYZED)); - writer.addDocument(doc); - 
- IndexReader ir = writer.getReader(); - - DirectSpellChecker spellChecker = new DirectSpellChecker(); - spellChecker.setMaxQueryFrequency(0F); - SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 1, ir, true); - assertEquals(0, similar.length); - - spellChecker = new DirectSpellChecker(); // reset defaults - spellChecker.setMinQueryLength(5); - similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir, true); - assertEquals(0, similar.length); - - spellChecker = new DirectSpellChecker(); // reset defaults - spellChecker.setMaxEdits(1); - similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true); - assertEquals(0, similar.length); - - spellChecker = new DirectSpellChecker(); // reset defaults - spellChecker.setAccuracy(0.9F); - similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true); - assertEquals(0, similar.length); - - spellChecker = new DirectSpellChecker(); // reset defaults - spellChecker.setMinPrefix(0); - similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true); - assertEquals(1, similar.length); - - spellChecker = new DirectSpellChecker(); // reset defaults - spellChecker.setMinPrefix(1); - similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true); - assertEquals(0, similar.length); - - ir.close(); - writer.close(); - dir.close(); - } -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,49 +0,0 @@ -package 
org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.LuceneTestCase; - -public class TestJaroWinklerDistance extends LuceneTestCase { - - private StringDistance sd = new JaroWinklerDistance(); - - public void testGetDistance() { - float d = sd.getDistance("al", "al"); - assertTrue(d == 1.0f); - d = sd.getDistance("martha", "marhta"); - assertTrue(d > 0.961 && d <0.962); - d = sd.getDistance("jones", "johnson"); - assertTrue(d > 0.832 && d < 0.833); - d = sd.getDistance("abcvwxyz", "cabvwxyz"); - assertTrue(d > 0.958 && d < 0.959); - d = sd.getDistance("dwayne", "duane"); - assertTrue(d > 0.84 && d < 0.841); - d = sd.getDistance("dixon", "dicksonx"); - assertTrue(d > 0.813 && d < 0.814); - d = sd.getDistance("fvie", "ten"); - assertTrue(d == 0f); - float d1 = sd.getDistance("zac ephron", "zac efron"); - float d2 = sd.getDistance("zac ephron", "kai ephron"); - assertTrue(d1 > d2); - d1 = sd.getDistance("brittney spears", "britney spears"); - d2 = sd.getDistance("brittney spears", "brittney startzman"); - assertTrue(d1 > d2); - } - -} \ No newline at end of file diff -ruN -x .svn -x build 
lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,54 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.util.LuceneTestCase; - -public class TestLevenshteinDistance extends LuceneTestCase { - - private StringDistance sd = new LevensteinDistance(); - - public void testGetDistance() { - float d = sd.getDistance("al", "al"); - assertEquals(d,1.0f,0.001); - d = sd.getDistance("martha", "marhta"); - assertEquals(d,0.6666,0.001); - d = sd.getDistance("jones", "johnson"); - assertEquals(d,0.4285,0.001); - d = sd.getDistance("abcvwxyz", "cabvwxyz"); - assertEquals(d,0.75,0.001); - d = sd.getDistance("dwayne", "duane"); - assertEquals(d,0.666,0.001); - d = sd.getDistance("dixon", "dicksonx"); - assertEquals(d,0.5,0.001); - d = sd.getDistance("six", "ten"); - assertEquals(d,0,0.001); - float d1 = sd.getDistance("zac ephron", "zac efron"); - float d2 = sd.getDistance("zac ephron", "kai ephron"); - assertEquals(d1,d2,0.001); - d1 = sd.getDistance("brittney spears", "britney spears"); - d2 = sd.getDistance("brittney spears", "brittney startzman"); - assertTrue(d1 > d2); - } - - public void testEmpty() throws Exception { - float d = sd.getDistance("", "al"); - assertEquals(d,0.0f,0.001); - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,210 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.Iterator; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.LuceneTestCase; - -/** - * Test case for LuceneDictionary. - * It first creates a simple index and then a couple of instances of LuceneDictionary - * on different fields and checks if all the right text comes back. 
- */ -public class TestLuceneDictionary extends LuceneTestCase { - - private Directory store; - - private IndexReader indexReader = null; - private LuceneDictionary ld; - private Iterator it; - - @Override - public void setUp() throws Exception { - super.setUp(); - store = newDirectory(); - IndexWriter writer = new IndexWriter(store, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false))); - - Document doc; - - doc = new Document(); - doc.add(newField("aaa", "foo", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(newField("aaa", "foo", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(new Field("contents", "Tom", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(new Field("contents", "Jerry", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(newField("zzz", "bar", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - writer.optimize(); - writer.close(); - } - - @Override - public void tearDown() throws Exception { - if (indexReader != null) - indexReader.close(); - store.close(); - super.tearDown(); - } - - public void testFieldNonExistent() throws IOException { - try { - indexReader = IndexReader.open(store, true); - - ld = new LuceneDictionary(indexReader, "nonexistent_field"); - it = ld.getWordsIterator(); - - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); - } finally { - if (indexReader != null) { indexReader.close(); } - } - } - - public void testFieldAaa() throws IOException { - try { - indexReader = IndexReader.open(store, true); - - ld = new LuceneDictionary(indexReader, "aaa"); - it = ld.getWordsIterator(); - - assertTrue("First element doesn't exist.", it.hasNext()); - assertTrue("First element 
isn't correct", it.next().equals("foo")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); - } finally { - if (indexReader != null) { indexReader.close(); } - } - } - - public void testFieldContents_1() throws IOException { - try { - indexReader = IndexReader.open(store, true); - - ld = new LuceneDictionary(indexReader, "contents"); - it = ld.getWordsIterator(); - - assertTrue("First element doesn't exist.", it.hasNext()); - assertTrue("First element isn't correct", it.next().equals("Jerry")); - assertTrue("Second element doesn't exist.", it.hasNext()); - assertTrue("Second element isn't correct", it.next().equals("Tom")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); - - ld = new LuceneDictionary(indexReader, "contents"); - it = ld.getWordsIterator(); - - int counter = 2; - while (it.hasNext()) { - it.next(); - counter--; - } - - assertTrue("Number of words incorrect", counter == 0); - } - finally { - if (indexReader != null) { indexReader.close(); } - } - } - - public void testFieldContents_2() throws IOException { - try { - indexReader = IndexReader.open(store, true); - - ld = new LuceneDictionary(indexReader, "contents"); - it = ld.getWordsIterator(); - - // hasNext() should have no side effects - assertTrue("First element isn't were it should be.", it.hasNext()); - assertTrue("First element isn't were it should be.", it.hasNext()); - assertTrue("First element isn't were it should be.", it.hasNext()); - - // just iterate through words - assertTrue("First element isn't correct", it.next().equals("Jerry")); - assertTrue("Second element isn't correct", it.next().equals("Tom")); - assertTrue("Nonexistent element is really null", it.next() == null); - - // hasNext() should still have no side effects ... 
- assertFalse("There should be any more elements", it.hasNext()); - assertFalse("There should be any more elements", it.hasNext()); - assertFalse("There should be any more elements", it.hasNext()); - - // .. and there are really no more words - assertTrue("Nonexistent element is really null", it.next() == null); - assertTrue("Nonexistent element is really null", it.next() == null); - assertTrue("Nonexistent element is really null", it.next() == null); - } - finally { - if (indexReader != null) { indexReader.close(); } - } - } - - public void testFieldZzz() throws IOException { - try { - indexReader = IndexReader.open(store, true); - - ld = new LuceneDictionary(indexReader, "zzz"); - it = ld.getWordsIterator(); - - assertTrue("First element doesn't exist.", it.hasNext()); - assertTrue("First element isn't correct", it.next().equals("bar")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); - } - finally { - if (indexReader != null) { indexReader.close(); } - } - } - - public void testSpellchecker() throws IOException { - Directory dir = newDirectory(); - SpellChecker sc = new SpellChecker(dir); - indexReader = IndexReader.open(store, true); - sc.indexDictionary(new LuceneDictionary(indexReader, "contents")); - String[] suggestions = sc.suggestSimilar("Tam", 1); - assertEquals(1, suggestions.length); - assertEquals("Tom", suggestions[0]); - suggestions = sc.suggestSimilar("Jarry", 1); - assertEquals(1, suggestions.length); - assertEquals("Jerry", suggestions[0]); - indexReader.close(); - sc.close(); - dir.close(); - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java 2011-05-22 
12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,132 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.util.LuceneTestCase; - -public class TestNGramDistance extends LuceneTestCase { - - - - public void testGetDistance1() { - StringDistance nsd = new NGramDistance(1); - float d = nsd.getDistance("al", "al"); - assertEquals(d,1.0f,0.001); - d = nsd.getDistance("a", "a"); - assertEquals(d,1.0f,0.001); - d = nsd.getDistance("b", "a"); - assertEquals(d,0.0f,0.001); - d = nsd.getDistance("martha", "marhta"); - assertEquals(d,0.6666,0.001); - d = nsd.getDistance("jones", "johnson"); - assertEquals(d,0.4285,0.001); - d = nsd.getDistance("natural", "contrary"); - assertEquals(d,0.25,0.001); - d = nsd.getDistance("abcvwxyz", "cabvwxyz"); - assertEquals(d,0.75,0.001); - d = nsd.getDistance("dwayne", "duane"); - assertEquals(d,0.666,0.001); - d = nsd.getDistance("dixon", "dicksonx"); - assertEquals(d,0.5,0.001); - d = nsd.getDistance("six", "ten"); - assertEquals(d,0,0.001); - float d1 = nsd.getDistance("zac ephron", "zac efron"); - float d2 = nsd.getDistance("zac ephron", "kai ephron"); - assertEquals(d1,d2,0.001); - d1 = nsd.getDistance("brittney spears", "britney spears"); - d2 = nsd.getDistance("brittney spears", "brittney startzman"); - assertTrue(d1 > d2); - d1 = nsd.getDistance("12345678", "12890678"); - d2 = nsd.getDistance("12345678", "72385698"); - assertEquals(d1,d2,001); - } - - public void testGetDistance2() { - StringDistance sd = new NGramDistance(2); - float d = sd.getDistance("al", "al"); - assertEquals(d,1.0f,0.001); - d = sd.getDistance("a", "a"); - assertEquals(d,1.0f,0.001); - d = sd.getDistance("b", "a"); - assertEquals(d,0.0f,0.001); - d = sd.getDistance("a", "aa"); - assertEquals(d,0.5f,0.001); - d = sd.getDistance("martha", "marhta"); - assertEquals(d,0.6666,0.001); - d = sd.getDistance("jones", "johnson"); - assertEquals(d,0.4285,0.001); - d = sd.getDistance("natural", "contrary"); - assertEquals(d,0.25,0.001); - d = sd.getDistance("abcvwxyz", "cabvwxyz"); - assertEquals(d,0.625,0.001); - d = sd.getDistance("dwayne", 
"duane"); - assertEquals(d,0.5833,0.001); - d = sd.getDistance("dixon", "dicksonx"); - assertEquals(d,0.5,0.001); - d = sd.getDistance("six", "ten"); - assertEquals(d,0,0.001); - float d1 = sd.getDistance("zac ephron", "zac efron"); - float d2 = sd.getDistance("zac ephron", "kai ephron"); - assertTrue(d1 > d2); - d1 = sd.getDistance("brittney spears", "britney spears"); - d2 = sd.getDistance("brittney spears", "brittney startzman"); - assertTrue(d1 > d2); - d1 = sd.getDistance("0012345678", "0012890678"); - d2 = sd.getDistance("0012345678", "0072385698"); - assertEquals(d1,d2,0.001); - } - - public void testGetDistance3() { - StringDistance sd = new NGramDistance(3); - float d = sd.getDistance("al", "al"); - assertEquals(d,1.0f,0.001); - d = sd.getDistance("a", "a"); - assertEquals(d,1.0f,0.001); - d = sd.getDistance("b", "a"); - assertEquals(d,0.0f,0.001); - d = sd.getDistance("martha", "marhta"); - assertEquals(d,0.7222,0.001); - d = sd.getDistance("jones", "johnson"); - assertEquals(d,0.4762,0.001); - d = sd.getDistance("natural", "contrary"); - assertEquals(d,0.2083,0.001); - d = sd.getDistance("abcvwxyz", "cabvwxyz"); - assertEquals(d,0.5625,0.001); - d = sd.getDistance("dwayne", "duane"); - assertEquals(d,0.5277,0.001); - d = sd.getDistance("dixon", "dicksonx"); - assertEquals(d,0.4583,0.001); - d = sd.getDistance("six", "ten"); - assertEquals(d,0,0.001); - float d1 = sd.getDistance("zac ephron", "zac efron"); - float d2 = sd.getDistance("zac ephron", "kai ephron"); - assertTrue(d1 > d2); - d1 = sd.getDistance("brittney spears", "britney spears"); - d2 = sd.getDistance("brittney spears", "brittney startzman"); - assertTrue(d1 > d2); - d1 = sd.getDistance("0012345678", "0012890678"); - d2 = sd.getDistance("0012345678", "0072385698"); - assertTrue(d1 < d2); - } - - public void testEmpty() throws Exception { - StringDistance nsd = new NGramDistance(1); - float d = nsd.getDistance("", "al"); - assertEquals(d,0.0f,0.001); - } -} diff -ruN -x .svn -x build 
lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,47 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.StringReader; - -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.LuceneTestCase; - -/** - * Test case for PlainTextDictionary - * - */ -public class TestPlainTextDictionary extends LuceneTestCase { - - public void testBuild() throws IOException { - final String LF = System.getProperty("line.separator"); - String input = "oneword" + LF + "twoword" + LF + "threeword"; - PlainTextDictionary ptd = new PlainTextDictionary(new StringReader(input)); - Directory ramDir = newDirectory(); - SpellChecker spellChecker = new SpellChecker(ramDir); - spellChecker.indexDictionary(ptd); - String[] similar = spellChecker.suggestSimilar("treeword", 2); - assertEquals(2, similar.length); - assertEquals(similar[0], "threeword"); - assertEquals(similar[1], "oneword"); - spellChecker.close(); - ramDir.close(); - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java --- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java 2011-05-22 12:38:16.000000000 -0400 +++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,438 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.English; -import org.apache.lucene.util.LuceneTestCase; - -/** - * Spell checker test case - */ -public class TestSpellChecker extends LuceneTestCase { - private SpellCheckerMock spellChecker; - private Directory userindex, spellindex; - private List searchers; - - @Override - public void setUp() throws Exception { - super.setUp(); - - //create a user index - userindex = newDirectory(); - IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig( - TEST_VERSION_CURRENT, new MockAnalyzer(random))); - - for (int i = 0; i < 1000; i++) { - Document doc = new Document(); - doc.add(newField("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED)); - doc.add(newField("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word 
thousand - doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand - writer.addDocument(doc); - } - writer.close(); - searchers = Collections.synchronizedList(new ArrayList()); - // create the spellChecker - spellindex = newDirectory(); - spellChecker = new SpellCheckerMock(spellindex); - } - - @Override - public void tearDown() throws Exception { - userindex.close(); - if (!spellChecker.isClosed()) - spellChecker.close(); - spellindex.close(); - super.tearDown(); - } - - - public void testBuild() throws CorruptIndexException, IOException { - IndexReader r = IndexReader.open(userindex, true); - - spellChecker.clearIndex(); - - addwords(r, spellChecker, "field1"); - int num_field1 = this.numdoc(); - - addwords(r, spellChecker, "field2"); - int num_field2 = this.numdoc(); - - assertEquals(num_field2, num_field1 + 1); - - assertLastSearcherOpen(4); - - checkCommonSuggestions(r); - checkLevenshteinSuggestions(r); - - spellChecker.setStringDistance(new JaroWinklerDistance()); - spellChecker.setAccuracy(0.8f); - checkCommonSuggestions(r); - checkJaroWinklerSuggestions(); - // the accuracy is set to 0.8 by default, but the best result has a score of 0.925 - String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f); - assertTrue(similar.length == 0); - similar = spellChecker.suggestSimilar("fvie", 2, 0.92f); - assertTrue(similar.length == 1); - - similar = spellChecker.suggestSimilar("fiv", 2); - assertTrue(similar.length > 0); - assertEquals(similar[0], "five"); - - spellChecker.setStringDistance(new NGramDistance(2)); - spellChecker.setAccuracy(0.5f); - checkCommonSuggestions(r); - checkNGramSuggestions(); - - r.close(); - } - - public void testComparator() throws Exception { - IndexReader r = IndexReader.open(userindex, true); - Directory compIdx = newDirectory(); - SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator()); - 
addwords(r, compareSP, "field3"); - - String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false); - assertTrue(similar.length == 2); - //five and fvei have the same score, but different frequencies. - assertEquals("fvei", similar[0]); - assertEquals("five", similar[1]); - r.close(); - if (!compareSP.isClosed()) - compareSP.close(); - compIdx.close(); - } - - private void checkCommonSuggestions(IndexReader r) throws IOException { - String[] similar = spellChecker.suggestSimilar("fvie", 2); - assertTrue(similar.length > 0); - assertEquals(similar[0], "five"); - - similar = spellChecker.suggestSimilar("five", 2); - if (similar.length > 0) { - assertFalse(similar[0].equals("five")); // don't suggest a word for itself - } - - similar = spellChecker.suggestSimilar("fiv", 2); - assertTrue(similar.length > 0); - assertEquals(similar[0], "five"); - - similar = spellChecker.suggestSimilar("fives", 2); - assertTrue(similar.length > 0); - assertEquals(similar[0], "five"); - - assertTrue(similar.length > 0); - similar = spellChecker.suggestSimilar("fie", 2); - assertEquals(similar[0], "five"); - - // test restraint to a field - similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false); - assertEquals(0, similar.length); // there isn't the term thousand in the field field1 - - similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false); - assertEquals(1, similar.length); // there is the term thousand in the field field2 - } - - private void checkLevenshteinSuggestions(IndexReader r) throws IOException { - // test small word - String[] similar = spellChecker.suggestSimilar("fvie", 2); - assertEquals(1, similar.length); - assertEquals(similar[0], "five"); - - similar = spellChecker.suggestSimilar("five", 2); - assertEquals(1, similar.length); - assertEquals(similar[0], "nine"); // don't suggest a word for itself - - similar = spellChecker.suggestSimilar("fiv", 2); - assertEquals(1, similar.length); - assertEquals(similar[0], "five"); - - 
similar = spellChecker.suggestSimilar("ive", 2); - assertEquals(2, similar.length); - assertEquals(similar[0], "five"); - assertEquals(similar[1], "nine"); - - similar = spellChecker.suggestSimilar("fives", 2); - assertEquals(1, similar.length); - assertEquals(similar[0], "five"); - - similar = spellChecker.suggestSimilar("fie", 2); - assertEquals(2, similar.length); - assertEquals(similar[0], "five"); - assertEquals(similar[1], "nine"); - - similar = spellChecker.suggestSimilar("fi", 2); - assertEquals(1, similar.length); - assertEquals(similar[0], "five"); - - // test restraint to a field - similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false); - assertEquals(0, similar.length); // there isn't the term thousand in the field field1 - - similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false); - assertEquals(1, similar.length); // there is the term thousand in the field field2 - - similar = spellChecker.suggestSimilar("onety", 2); - assertEquals(2, similar.length); - assertEquals(similar[0], "ninety"); - assertEquals(similar[1], "one"); - try { - similar = spellChecker.suggestSimilar("tousand", 10, r, null, false); - } catch (NullPointerException e) { - assertTrue("threw an NPE, and it shouldn't have", false); - } - } - - private void checkJaroWinklerSuggestions() throws IOException { - String[] similar = spellChecker.suggestSimilar("onety", 2); - assertEquals(2, similar.length); - assertEquals(similar[0], "one"); - assertEquals(similar[1], "ninety"); - } - - private void checkNGramSuggestions() throws IOException { - String[] similar = spellChecker.suggestSimilar("onety", 2); - assertEquals(2, similar.length); - assertEquals(similar[0], "one"); - assertEquals(similar[1], "ninety"); - } - - private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException { - long time = System.currentTimeMillis(); - sc.indexDictionary(new LuceneDictionary(r, field)); - time = System.currentTimeMillis() - time; - 
//System.out.println("time to build " + field + ": " + time); - } - - private int numdoc() throws IOException { - IndexReader rs = IndexReader.open(spellindex, true); - int num = rs.numDocs(); - assertTrue(num != 0); - //System.out.println("num docs: " + num); - rs.close(); - return num; - } - - public void testClose() throws IOException { - IndexReader r = IndexReader.open(userindex, true); - spellChecker.clearIndex(); - String field = "field1"; - addwords(r, spellChecker, "field1"); - int num_field1 = this.numdoc(); - addwords(r, spellChecker, "field2"); - int num_field2 = this.numdoc(); - assertEquals(num_field2, num_field1 + 1); - checkCommonSuggestions(r); - assertLastSearcherOpen(4); - spellChecker.close(); - assertSearchersClosed(); - try { - spellChecker.close(); - fail("spellchecker was already closed"); - } catch (AlreadyClosedException e) { - // expected - } - try { - checkCommonSuggestions(r); - fail("spellchecker was already closed"); - } catch (AlreadyClosedException e) { - // expected - } - - try { - spellChecker.clearIndex(); - fail("spellchecker was already closed"); - } catch (AlreadyClosedException e) { - // expected - } - - try { - spellChecker.indexDictionary(new LuceneDictionary(r, field)); - fail("spellchecker was already closed"); - } catch (AlreadyClosedException e) { - // expected - } - - try { - spellChecker.setSpellIndex(spellindex); - fail("spellchecker was already closed"); - } catch (AlreadyClosedException e) { - // expected - } - assertEquals(4, searchers.size()); - assertSearchersClosed(); - r.close(); - } - - /* - * tests if the internally shared indexsearcher is correctly closed - * when the spellchecker is concurrently accessed and closed. 
- */ - public void testConcurrentAccess() throws IOException, InterruptedException { - assertEquals(1, searchers.size()); - final IndexReader r = IndexReader.open(userindex, true); - spellChecker.clearIndex(); - assertEquals(2, searchers.size()); - addwords(r, spellChecker, "field1"); - assertEquals(3, searchers.size()); - int num_field1 = this.numdoc(); - addwords(r, spellChecker, "field2"); - assertEquals(4, searchers.size()); - int num_field2 = this.numdoc(); - assertEquals(num_field2, num_field1 + 1); - int numThreads = 5 + this.random.nextInt(5); - ExecutorService executor = Executors.newFixedThreadPool(numThreads); - SpellCheckWorker[] workers = new SpellCheckWorker[numThreads]; - for (int i = 0; i < numThreads; i++) { - SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r); - executor.execute(spellCheckWorker); - workers[i] = spellCheckWorker; - - } - int iterations = 5 + random.nextInt(5); - for (int i = 0; i < iterations; i++) { - Thread.sleep(100); - // concurrently reset the spell index - spellChecker.setSpellIndex(this.spellindex); - // for debug - prints the internal open searchers - // showSearchersOpen(); - } - - spellChecker.close(); - executor.shutdown(); - // wait for 60 seconds - usually this is very fast but coverage runs could take quite long - executor.awaitTermination(60L, TimeUnit.SECONDS); - - for (int i = 0; i < workers.length; i++) { - assertFalse(String.format("worker thread %d failed", i), workers[i].failed); - assertTrue(String.format("worker thread %d is still running but should be terminated", i), workers[i].terminated); - } - // 4 searchers more than iterations - // 1. at creation - // 2. clearIndex() - // 2. and 3. 
during addwords - assertEquals(iterations + 4, searchers.size()); - assertSearchersClosed(); - r.close(); - } - - private void assertLastSearcherOpen(int numSearchers) { - assertEquals(numSearchers, searchers.size()); - IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]); - for (int i = 0; i < searcherArray.length; i++) { - if (i == searcherArray.length - 1) { - assertTrue("expected last searcher open but was closed", - searcherArray[i].getIndexReader().getRefCount() > 0); - } else { - assertFalse("expected closed searcher but was open - Index: " + i, - searcherArray[i].getIndexReader().getRefCount() > 0); - } - } - } - - private void assertSearchersClosed() { - for (IndexSearcher searcher : searchers) { - assertEquals(0, searcher.getIndexReader().getRefCount()); - } - } - - // For debug -// private void showSearchersOpen() { -// int count = 0; -// for (IndexSearcher searcher : searchers) { -// if(searcher.getIndexReader().getRefCount() > 0) -// ++count; -// } -// System.out.println(count); -// } - - - private class SpellCheckWorker implements Runnable { - private final IndexReader reader; - volatile boolean terminated = false; - volatile boolean failed = false; - - SpellCheckWorker(IndexReader reader) { - super(); - this.reader = reader; - } - - public void run() { - try { - while (true) { - try { - checkCommonSuggestions(reader); - } catch (AlreadyClosedException e) { - - return; - } catch (Throwable e) { - - e.printStackTrace(); - failed = true; - return; - } - } - } finally { - terminated = true; - } - } - - } - - class SpellCheckerMock extends SpellChecker { - public SpellCheckerMock(Directory spellIndex) throws IOException { - super(spellIndex); - } - - public SpellCheckerMock(Directory spellIndex, StringDistance sd) - throws IOException { - super(spellIndex, sd); - } - - public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException { - super(spellIndex, sd, comparator); - } - - @Override - 
IndexSearcher createSearcher(Directory dir) throws IOException { - IndexSearcher searcher = super.createSearcher(dir); - TestSpellChecker.this.searchers.add(searcher); - return searcher; - } - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/modules/build.xml lucene-trunk/modules/build.xml --- lucene-clean-trunk/modules/build.xml 2011-05-22 12:38:11.000000000 -0400 +++ lucene-trunk/modules/build.xml 2011-05-22 19:07:14.000000000 -0400 @@ -25,6 +25,7 @@ + @@ -35,6 +36,7 @@ + @@ -45,6 +47,7 @@ + @@ -55,6 +58,7 @@ + @@ -66,6 +70,7 @@ + @@ -96,6 +101,7 @@ + diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/build.xml lucene-trunk/modules/suggest/build.xml --- lucene-clean-trunk/modules/suggest/build.xml 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/build.xml 2011-05-22 18:58:21.000000000 -0400 @@ -0,0 +1,47 @@ + + + + + + + + Suggest + + + + + + + + + + + + + + + + + + + + + + + diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,35 @@ +package org.apache.lucene.search.spell; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Iterator; + +/** + * A simple interface representing a Dictionary. A Dictionary + * here is just a list of words. + * + * + * @version 1.0 + */ +public interface Dictionary { + + /** + * Return all words present in the dictionary + * @return Iterator + */ + Iterator getWordsIterator(); +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,487 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Locale; +import java.util.PriorityQueue; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FuzzyTermsEnum; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.LevenshteinAutomata; + +/** + * Simple automaton-based spellchecker. + *

+ * Candidates are presented directly from the term dictionary, based on + * Levenshtein distance. This is an alternative to {@link SpellChecker} + * if you are using an edit-distance-like metric such as Levenshtein + * or {@link JaroWinklerDistance}. + *

+ * A practical benefit of this spellchecker is that it requires no additional + * datastructures (neither in RAM nor on disk) to do its work. + * + * @see LevenshteinAutomata + * @see FuzzyTermsEnum + * + * @lucene.experimental + */ +public class DirectSpellChecker { + /** The default StringDistance, Levenshtein distance implemented internally + * via {@link LevenshteinAutomata}. + *

+ * Note: this is the fastest distance metric, because Levenshtein is used + * to draw candidates from the term dictionary: this just re-uses the scoring. + *

+ * Note also that this metric differs in subtle ways from {@link LevensteinDistance}: + *

    + *
  • This metric treats full unicode codepoints as characters, but + * LevenshteinDistance calculates based on UTF-16 code units. + *
  • This metric scales raw edit distances into a floating point score + * differently than LevenshteinDistance: the scaling is based upon the + * shortest of the two terms instead of the longest. + *
+ */ + public static final StringDistance INTERNAL_LEVENSHTEIN = new StringDistance() { + public float getDistance(String s1, String s2) { + throw new UnsupportedOperationException("Not for external use."); + }}; + + /** maximum edit distance for candidate terms */ + private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; + /** minimum prefix for candidate terms */ + private int minPrefix = 1; + /** maximum number of top-N inspections per suggestion */ + private int maxInspections = 5; + /** minimum accuracy for a term to match */ + private float accuracy = SpellChecker.DEFAULT_ACCURACY; + /** value in [0..1] (or absolute number >=1) representing the minimum + * number of documents (of the total) where a term should appear. */ + private float thresholdFrequency = 0f; + /** minimum length of a query word to return suggestions */ + private int minQueryLength = 4; + /** value in [0..1] (or absolute number >=1) representing the maximum + * number of documents (of the total) a query term can appear in to + * be corrected. */ + private float maxQueryFrequency = 0.01f; + /** true if the spellchecker should lowercase terms */ + private boolean lowerCaseTerms = true; + /** the comparator to use */ + private Comparator comparator = SuggestWordQueue.DEFAULT_COMPARATOR; + /** the string distance to use */ + private StringDistance distance = INTERNAL_LEVENSHTEIN; + + /** Get the maximum number of Levenshtein edit-distances to draw + * candidate terms from. */ + public int getMaxEdits() { + return maxEdits; + } + + /** Sets the maximum number of Levenshtein edit-distances to draw + * candidate terms from. This value can be 1 or 2. The default is 2. + *

+ * Note: a large number of spelling errors occur with an edit distance + * of 1, by setting this value to 1 you can increase both performance + * and precision at the cost of recall. + */ + public void setMaxEdits(int maxEdits) { + if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) + throw new UnsupportedOperationException("Invalid maxEdits"); + this.maxEdits = maxEdits; + } + + /** + * Get the minimal number of characters that must match exactly + */ + public int getMinPrefix() { + return minPrefix; + } + + /** + * Sets the minimal number of initial characters (default: 1) + * that must match exactly. + *

+ * This can improve both performance and accuracy of results, + * as misspellings are commonly not the first character. + */ + public void setMinPrefix(int minPrefix) { + this.minPrefix = minPrefix; + } + + /** + * Get the maximum number of top-N inspections per suggestion + */ + public int getMaxInspections() { + return maxInspections; + } + + /** + * Set the maximum number of top-N inspections (default: 5) per suggestion. + *

+ * Increasing this number can improve the accuracy of results, at the cost + * of performance. + */ + public void setMaxInspections(int maxInspections) { + this.maxInspections = maxInspections; + } + + /** + * Get the minimal accuracy from the StringDistance for a match + */ + public float getAccuracy() { + return accuracy; + } + + /** + * Set the minimal accuracy required (default: 0.5f) from a StringDistance + * for a suggestion match. + */ + public void setAccuracy(float accuracy) { + this.accuracy = accuracy; + } + + /** + * Get the minimal threshold of documents a term must appear for a match + */ + public float getThresholdFrequency() { + return thresholdFrequency; + } + + /** + * Set the minimal threshold of documents a term must appear for a match. + *

+ * This can improve quality by only suggesting high-frequency terms. Note that + * very high values might decrease performance slightly, by forcing the spellchecker + * to draw more candidates from the term dictionary, but a practical value such + * as 1 can be very useful towards improving quality. + *

+ * This can be specified as a relative percentage of documents such as 0.5f, + * or it can be specified as an absolute whole document frequency, such as 4f. + * Absolute document frequencies may not be fractional. + */ + public void setThresholdFrequency(float thresholdFrequency) { + if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency) + throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed"); + this.thresholdFrequency = thresholdFrequency; + } + + /** Get the minimum length of a query term needed to return suggestions */ + public int getMinQueryLength() { + return minQueryLength; + } + + /** + * Set the minimum length of a query term (default: 4) needed to return suggestions. + *

+ * Very short query terms will often cause only bad suggestions with any distance + * metric. + */ + public void setMinQueryLength(int minQueryLength) { + this.minQueryLength = minQueryLength; + } + + /** + * Get the maximum threshold of documents a query term can appear in order + * to provide suggestions. + */ + public float getMaxQueryFrequency() { + return maxQueryFrequency; + } + + /** + * Set the maximum threshold (default: 0.01f) of documents a query term can + * appear in order to provide suggestions. + *

+ * Very high-frequency terms are typically spelled correctly. Additionally, + * this can increase performance as it will do no work for the common case + * of correctly-spelled input terms. + *

+ * This can be specified as a relative percentage of documents such as 0.5f, + * or it can be specified as an absolute whole document frequency, such as 4f. + * Absolute document frequencies may not be fractional. + */ + public void setMaxQueryFrequency(float maxQueryFrequency) { + if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency) + throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed"); + this.maxQueryFrequency = maxQueryFrequency; + } + + /** true if the spellchecker should lowercase terms */ + public boolean getLowerCaseTerms() { + return lowerCaseTerms; + } + + /** + * True if the spellchecker should lowercase terms (default: true) + *

+ * This is a convenience method, if your index field has more complicated + * analysis (such as StandardTokenizer removing punctuation), its probably + * better to turn this off, and instead run your query terms through your + * Analyzer first. + *

+ * If this option is not on, case differences count as an edit! + */ + public void setLowerCaseTerms(boolean lowerCaseTerms) { + this.lowerCaseTerms = lowerCaseTerms; + } + + /** + * Get the current comparator in use. + */ + public Comparator getComparator() { + return comparator; + } + + /** + * Set the comparator for sorting suggestions. + * The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR} + */ + public void setComparator(Comparator comparator) { + this.comparator = comparator; + } + + /** + * Get the string distance metric in use. + */ + public StringDistance getDistance() { + return distance; + } + + /** + * Set the string distance metric. + * The default is {@link #INTERNAL_LEVENSHTEIN} + *

+ * Note: because this spellchecker draws its candidates from the + * term dictionary using Levenshtein, it works best with an edit-distance-like + * string metric. If you use a different metric than the default, + * you might want to consider increasing {@link #setMaxInspections(int)} + * to draw more candidates for your metric to rank. + */ + public void setDistance(StringDistance distance) { + this.distance = distance; + } + + /** + * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean) + * suggestSimilar(term, numSug, ir, false)} + */ + public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir) + throws IOException { + return suggestSimilar(term, numSug, ir, false); + } + + /** + * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean, float) + * suggestSimilar(term, numSug, ir, morePopular, this.accuracy)} + */ + public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, + boolean morePopular) throws IOException { + return suggestSimilar(term, numSug, ir, morePopular, accuracy); + } + + /** + * Suggest similar words. + * + *

Unlike {@link SpellChecker}, the similarity used to fetch the most + * relevant terms is an edit distance, therefore typically a low value + * for numSug will work very well. + * + * @param term Term you want to spell check on + * @param numSug the maximum number of suggested words + * @param ir IndexReader to find terms from + * @param morePopular return only suggested words that are as frequent or more frequent than the searched word + * @param accuracy return only suggested words that match with this similarity + * @return sorted list of the suggested words according to the comparator + * @throws IOException + */ + public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, + boolean morePopular, float accuracy) throws IOException { + + String text = term.text(); + if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength) + return new SuggestWord[0]; + + if (lowerCaseTerms) + term = term.createTerm(text.toLowerCase(Locale.ENGLISH)); + + int docfreq = ir.docFreq(term); + + // see line 341 of spellchecker. this is certainly very very nice for perf, + // but is it really the right way to go? 
+ if (!morePopular && docfreq > 0) { + return new SuggestWord[0]; + } + + int maxDoc = ir.maxDoc(); + + if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) { + return new SuggestWord[0]; + } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) { + return new SuggestWord[0]; + } + + if (!morePopular) docfreq = 0; + + if (thresholdFrequency >= 1f) { + docfreq = Math.max(docfreq, (int) thresholdFrequency); + } else if (thresholdFrequency > 0f) { + docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1); + } + + Collection terms = null; + int inspections = numSug * maxInspections; + + // try ed=1 first, in case we get lucky + terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy); + if (maxEdits > 1 && terms.size() < inspections) { + HashSet moreTerms = new HashSet(); + moreTerms.addAll(terms); + moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy)); + terms = moreTerms; + } + + // create the suggestword response, sort it, and trim it to size. + + SuggestWord suggestions[] = new SuggestWord[terms.size()]; + int index = suggestions.length - 1; + for (ScoreTerm s : terms) { + SuggestWord suggestion = new SuggestWord(); + suggestion.string = s.termAsString != null ? 
s.termAsString : s.term.utf8ToString(); + suggestion.score = s.score; + suggestion.freq = s.docfreq; + suggestions[index--] = suggestion; + } + + ArrayUtil.mergeSort(suggestions, Collections.reverseOrder(comparator)); + if (numSug < suggestions.length) { + SuggestWord trimmed[] = new SuggestWord[numSug]; + System.arraycopy(suggestions, 0, trimmed, 0, numSug); + suggestions = trimmed; + } + return suggestions; + } + + private Collection suggestSimilar(Term term, int numSug, + IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException { + + AttributeSource atts = new AttributeSource(); + MaxNonCompetitiveBoostAttribute maxBoostAtt = + atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); + FuzzyTermsEnum e = new FuzzyTermsEnum(MultiFields.getTerms(ir, term.field()).iterator(), atts, term, editDistance, Math.max(minPrefix, editDistance-1)); + final PriorityQueue stQueue = new PriorityQueue(); + + BytesRef queryTerm = new BytesRef(term.text()); + BytesRef candidateTerm; + ScoreTerm st = new ScoreTerm(); + BoostAttribute boostAtt = + e.attributes().addAttribute(BoostAttribute.class); + while ((candidateTerm = e.next()) != null) { + final float boost = boostAtt.getBoost(); + // ignore uncompetitive hits + if (stQueue.size() >= numSug && boost <= stQueue.peek().boost) + continue; + + // ignore exact match of the same term + if (queryTerm.bytesEquals(candidateTerm)) + continue; + + int df = e.docFreq(); + + // check docFreq if required + if (df <= docfreq) + continue; + + final float score; + final String termAsString; + if (distance == INTERNAL_LEVENSHTEIN) { + // delay creating strings until the end + termAsString = null; + // undo FuzzyTermsEnum's scale factor for a real scaled lev score + score = boost / e.getScaleFactor() + e.getMinSimilarity(); + } else { + termAsString = candidateTerm.utf8ToString(); + score = distance.getDistance(term.text(), termAsString); + } + + if (score < accuracy) + continue; + + // add new entry in PQ + st.term = 
new BytesRef(candidateTerm); + st.boost = boost; + st.docfreq = df; + st.termAsString = termAsString; + st.score = score; + stQueue.offer(st); + // possibly drop entries from queue + st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); + maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); + } + + return stQueue; + } + + private static class ScoreTerm implements Comparable { + public BytesRef term; + public float boost; + public int docfreq; + + public String termAsString; + public float score; + + public int compareTo(ScoreTerm other) { + if (term.bytesEquals(other.term)) + return 0; // consistent with equals + if (this.boost == other.boost) + return other.term.compareTo(this.term); + else + return Float.compare(this.boost, other.boost); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + ScoreTerm other = (ScoreTerm) obj; + if (term == null) { + if (other.term != null) return false; + } else if (!term.bytesEquals(other.term)) return false; + return true; + } + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java 2011-05-22 19:00:10.000000000 -0400 @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.spell; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.search.spell.Dictionary; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.BytesRef; + +/** + * HighFrequencyDictionary: terms taken from the given field + * of a Lucene index, which appear in a number of documents + * above a given threshold. + * + * Threshold is a value in [0..1] representing the minimum + * number of documents (of the total) where a term should appear. + * + * Based on LuceneDictionary. 
+ */ +public class HighFrequencyDictionary implements Dictionary { + private IndexReader reader; + private String field; + private float thresh; + + public HighFrequencyDictionary(IndexReader reader, String field, float thresh) { + this.reader = reader; + this.field = StringHelper.intern(field); + this.thresh = thresh; + } + + public final Iterator getWordsIterator() { + return new HighFrequencyIterator(); + } + + final class HighFrequencyIterator implements TermFreqIterator, SortedIterator { + private TermsEnum termsEnum; + private BytesRef actualTerm; + private boolean hasNextCalled; + private int minNumDocs; + + HighFrequencyIterator() { + try { + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + termsEnum = terms.iterator(); + } + minNumDocs = (int)(thresh * (float)reader.numDocs()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private boolean isFrequent(int freq) { + return freq >= minNumDocs; + } + + public float freq() { + try { + return termsEnum.docFreq(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + + public String next() { + if (!hasNextCalled && !hasNext()) { + return null; + } + hasNextCalled = false; + + return (actualTerm != null) ? actualTerm.utf8ToString() : null; + } + + public boolean hasNext() { + if (hasNextCalled) { + return actualTerm != null; + } + hasNextCalled = true; + + if (termsEnum == null) { + return false; + } + + while(true) { + + try { + actualTerm = termsEnum.next(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + // if there are no words return false + if (actualTerm == null) { + return false; + } + + // got a valid term, does it pass the threshold? 
+ try { + if (isFrequent(termsEnum.docFreq())) { + return true; + } + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,112 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Arrays; + +public class JaroWinklerDistance implements StringDistance { + + private float threshold = 0.7f; + + private int[] matches(String s1, String s2) { + String max, min; + if (s1.length() > s2.length()) { + max = s1; + min = s2; + } else { + max = s2; + min = s1; + } + int range = Math.max(max.length() / 2 - 1, 0); + int[] matchIndexes = new int[min.length()]; + Arrays.fill(matchIndexes, -1); + boolean[] matchFlags = new boolean[max.length()]; + int matches = 0; + for (int mi = 0; mi < min.length(); mi++) { + char c1 = min.charAt(mi); + for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max + .length()); xi < xn; xi++) { + if (!matchFlags[xi] && c1 == max.charAt(xi)) { + matchIndexes[mi] = xi; + matchFlags[xi] = true; + matches++; + break; + } + } + } + char[] ms1 = new char[matches]; + char[] ms2 = new char[matches]; + for (int i = 0, si = 0; i < min.length(); i++) { + if (matchIndexes[i] != -1) { + ms1[si] = min.charAt(i); + si++; + } + } + for (int i = 0, si = 0; i < max.length(); i++) { + if (matchFlags[i]) { + ms2[si] = max.charAt(i); + si++; + } + } + int transpositions = 0; + for (int mi = 0; mi < ms1.length; mi++) { + if (ms1[mi] != ms2[mi]) { + transpositions++; + } + } + int prefix = 0; + for (int mi = 0; mi < min.length(); mi++) { + if (s1.charAt(mi) == s2.charAt(mi)) { + prefix++; + } else { + break; + } + } + return new int[] { matches, transpositions / 2, prefix, max.length() }; + } + + public float getDistance(String s1, String s2) { + int[] mtp = matches(s1, s2); + float m = mtp[0]; + if (m == 0) { + return 0f; + } + float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3; + float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2] + * (1 - j); + return jw; + } + + /** + * Sets the threshold used to determine when Winkler bonus should be used. + * Set to a negative value to get the Jaro distance. 
+ * @param threshold the new value of the threshold + */ + public void setThreshold(float threshold) { + this.threshold = threshold; + } + + /** + * Returns the current value of the threshold used for adding the Winkler bonus. + * The default value is 0.7. + * @return the current value of the threshold + */ + public float getThreshold() { + return threshold; + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,109 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Levenstein edit distance class. + */ +public final class LevensteinDistance implements StringDistance { + + /** + * Optimized to run a bit faster than the static getDistance(). 
+ * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster. + */ + public LevensteinDistance () { + } + + + //***************************** + // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String) + //***************************** + public float getDistance (String target, String other) { + char[] sa; + int n; + int p[]; //'previous' cost array, horizontally + int d[]; // cost array, horizontally + int _d[]; //placeholder to assist in swapping p and d + + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length()+1 by t.length()+1, + we maintain two single-dimensional arrays of length s.length()+1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. 
+ */ + + sa = target.toCharArray(); + n = sa.length; + p = new int[n+1]; + d = new int[n+1]; + + final int m = other.length(); + if (n == 0 || m == 0) { + if (n == m) { + return 1; + } + else { + return 0; + } + } + + + // indexes into strings s and t + int i; // iterates through s + int j; // iterates through t + + char t_j; // jth character of t + + int cost; // cost + + for (i = 0; i<=n; i++) { + p[i] = i; + } + + for (j = 1; j<=m; j++) { + t_j = other.charAt(j-1); + d[0] = j; + + for (i=1; i<=n; i++) { + cost = sa[i-1]==t_j ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); + } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } + + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length)); + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,96 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; + +import java.util.Iterator; + +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.StringHelper; + +import java.io.*; + +/** + * Lucene Dictionary: terms taken from the given field + * of a Lucene index. + * + * When using IndexReader.terms(Term) the code must not call next() on TermEnum + * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6 + * + * + * + */ +public class LuceneDictionary implements Dictionary { + private IndexReader reader; + private String field; + + public LuceneDictionary(IndexReader reader, String field) { + this.reader = reader; + this.field = StringHelper.intern(field); + } + + public final Iterator getWordsIterator() { + return new LuceneIterator(); + } + + + final class LuceneIterator implements Iterator { + private TermsEnum termsEnum; + private BytesRef pendingTerm; + + LuceneIterator() { + try { + final Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + termsEnum = terms.iterator(); + pendingTerm = termsEnum.next(); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public String next() { + if (pendingTerm == null) { + return null; + } + + String result = pendingTerm.utf8ToString(); + + try { + pendingTerm = termsEnum.next(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + return result; + } + + public boolean hasNext() { + 
return pendingTerm != null; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/NGramDistance.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/NGramDistance.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/NGramDistance.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/NGramDistance.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,144 @@ +package org.apache.lucene.search.spell; + +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +/** + * N-Gram version of edit distance based on paper by Grzegorz Kondrak, + * "N-gram similarity and distance". Proceedings of the Twelfth International + * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126, + * Buenos Aires, Argentina, November 2005. 
+ * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf + * + * This implementation uses the position-based optimization to compute partial + * matches of n-gram sub-strings and adds a null-character prefix of size n-1 + * so that the first character is contained in the same number of n-grams as + * a middle character. Null-character prefix matches are discounted so that + * strings with no matching characters will return a distance of 0. + * + */ +public class NGramDistance implements StringDistance { + + private int n; + + /** + * Creates an N-Gram distance measure using n-grams of the specified size. + * @param size The size of the n-gram to be used to compute the string distance. + */ + public NGramDistance(int size) { + this.n = size; + } + + /** + * Creates an N-Gram distance measure using n-grams of size 2. + */ + public NGramDistance() { + this(2); + } + + public float getDistance(String source, String target) { + final int sl = source.length(); + final int tl = target.length(); + + if (sl == 0 || tl == 0) { + if (sl == tl) { + return 1; + } + else { + return 0; + } + } + + int cost = 0; + if (sl < n || tl < n) { + for (int i=0,ni=Math.min(sl,tl);iFormat allowed: 1 word per line:
+ * word1
+ * word2
+ * word3
+ */ +public class PlainTextDictionary implements Dictionary { + + private BufferedReader in; + private String line; + private boolean hasNextCalled; + + public PlainTextDictionary(File file) throws FileNotFoundException { + in = new BufferedReader(new FileReader(file)); + } + + public PlainTextDictionary(InputStream dictFile) { + in = new BufferedReader(new InputStreamReader(dictFile)); + } + + /** + * Creates a dictionary based on a reader. + */ + public PlainTextDictionary(Reader reader) { + in = new BufferedReader(reader); + } + + public Iterator getWordsIterator() { + return new fileIterator(); + } + + final class fileIterator implements Iterator { + public String next() { + if (!hasNextCalled) { + hasNext(); + } + hasNextCalled = false; + return line; + } + + public boolean hasNext() { + hasNextCalled = true; + try { + line = in.readLine(); + } catch (IOException ex) { + throw new RuntimeException(ex); + } + return (line != null) ? true : false; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java 2011-05-22 16:52:15.000000000 -0400 @@ -0,0 +1,11 @@ +package org.apache.lucene.search.spell; + +import java.util.Iterator; + +/** + * Marker interface to signal that elements coming from {@link Iterator} + * come in ascending lexicographic order. 
+ */ +public interface SortedIterator { + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,724 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.TieredMergePolicy; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.Version; + +/** + *

+ * Spell Checker class (Main class)
+ * (initially inspired by the David Spencer code). + *

+ * + *

Example Usage: + * + *

+ *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
+ *  // To index a field of a user index:
+ *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
+ *  // To index a file containing words:
+ *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
+ *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
+ * 
+ * + * + * @version 1.0 + */ +public class SpellChecker implements java.io.Closeable { + + /** + * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} . + */ + public static final float DEFAULT_ACCURACY = 0.5f; + + /** + * Field name for each word in the ngram index. + */ + public static final String F_WORD = "word"; + + private static final Term F_WORD_TERM = new Term(F_WORD); + + /** + * the spell index + */ + // don't modify the directory directly - see #swapSearcher() + // TODO: why is this package private? + Directory spellIndex; + /** + * Boost value for start and end grams + */ + private float bStart = 2.0f; + + private float bEnd = 1.0f; + // don't use this searcher directly - see #swapSearcher() + + private IndexSearcher searcher; + /* + * this locks all modifications to the current searcher. + */ + + private final Object searcherLock = new Object(); + /* + * this lock synchronizes all possible modifications to the + * current index directory. It should not be possible to try modifying + * the same index concurrently. Note: Do not acquire the searcher lock + * before acquiring this lock! + */ + private final Object modifyCurrentIndexLock = new Object(); + + private volatile boolean closed = false; + // minimum score for hits generated by the spell checker query + + private float accuracy = DEFAULT_ACCURACY; + + private StringDistance sd; + private Comparator comparator; + + /** + * Use the given directory as a spell checker index. The directory + * is created if it doesn't exist yet. 
+ * @param spellIndex the spell index directory + * @param sd the {@link StringDistance} measurement to use + * @throws IOException if Spellchecker can not open the directory + */ + public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException { + this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR); + } + /** + * Use the given directory as a spell checker index with a + * {@link LevensteinDistance} as the default {@link StringDistance}. The + * directory is created if it doesn't exist yet. + * + * @param spellIndex + * the spell index directory + * @throws IOException + * if spellchecker can not open the directory + */ + public SpellChecker(Directory spellIndex) throws IOException { + this(spellIndex, new LevensteinDistance()); + } + + /** + * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure + * and the given {@link java.util.Comparator} for sorting the results. + * @param spellIndex The spelling index + * @param sd The distance + * @param comparator The comparator + * @throws IOException if there is a problem opening the index + */ + public SpellChecker(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException { + setSpellIndex(spellIndex); + setStringDistance(sd); + this.comparator = comparator; + } + + /** + * Use a different index as the spell checker index or re-open + * the existing index if spellIndex is the same value + * as given in the constructor. 
+ * @param spellIndexDir the spell directory to use + * @throws AlreadyClosedException if the Spellchecker is already closed + * @throws IOException if spellchecker can not open the directory + */ + // TODO: we should make this final as it is called in the constructor + public void setSpellIndex(Directory spellIndexDir) throws IOException { + // this could be the same directory as the current spellIndex + // modifications to the directory should be synchronized + synchronized (modifyCurrentIndexLock) { + ensureOpen(); + if (!IndexReader.indexExists(spellIndexDir)) { + IndexWriter writer = new IndexWriter(spellIndexDir, + new IndexWriterConfig(Version.LUCENE_CURRENT, + new WhitespaceAnalyzer(Version.LUCENE_CURRENT))); + writer.close(); + } + swapSearcher(spellIndexDir); + } + } + + /** + * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}. + * @param comparator the comparator + */ + public void setComparator(Comparator comparator) { + this.comparator = comparator; + } + + public Comparator getComparator() { + return comparator; + } + + /** + * Sets the {@link StringDistance} implementation for this + * {@link SpellChecker} instance. + * + * @param sd the {@link StringDistance} implementation for this + * {@link SpellChecker} instance + */ + public void setStringDistance(StringDistance sd) { + this.sd = sd; + } + /** + * Returns the {@link StringDistance} instance used by this + * {@link SpellChecker} instance. + * + * @return the {@link StringDistance} instance used by this + * {@link SpellChecker} instance. 
+ */ + public StringDistance getStringDistance() { + return sd; + } + + /** + * Sets the accuracy 0 < minScore < 1; default {@link #DEFAULT_ACCURACY} + * @param acc The new accuracy + */ + public void setAccuracy(float acc) { + this.accuracy = acc; + } + + /** + * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)}, to + * decide whether a suggestion is included or not. + * @return The current accuracy setting + */ + public float getAccuracy() { + return accuracy; + } + + /** + * Suggest similar words. + * + *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms + * is not the same as the edit distance strategy used to calculate the best + * matching spell-checked word from the hits that Lucene found, one usually has + * to retrieve a couple of numSug's in order to get the true best match. + * + *

I.e. if numSug == 1, don't count on that suggestion being the best one. + * Thus, you should set this value to at least 5 for a good suggestion. + * + * @param word the word you want a spell check done on + * @param numSug the number of suggested words + * @throws IOException if the underlying index throws an {@link IOException} + * @throws AlreadyClosedException if the Spellchecker is already closed + * @return String[] + * + * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) + */ + public String[] suggestSimilar(String word, int numSug) throws IOException { + return this.suggestSimilar(word, numSug, null, null, false); + } + + /** + * Suggest similar words. + * + *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms + * is not the same as the edit distance strategy used to calculate the best + * matching spell-checked word from the hits that Lucene found, one usually has + * to retrieve a couple of numSug's in order to get the true best match. + * + *

I.e. if numSug == 1, don't count on that suggestion being the best one. + * Thus, you should set this value to at least 5 for a good suggestion. + * + * @param word the word you want a spell check done on + * @param numSug the number of suggested words + * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results + * @throws IOException if the underlying index throws an {@link IOException} + * @throws AlreadyClosedException if the Spellchecker is already closed + * @return String[] + * + * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) + */ + public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException { + return this.suggestSimilar(word, numSug, null, null, false, accuracy); + } + + /** + * Suggest similar words (optionally restricted to a field of an index). + * + *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms + * is not the same as the edit distance strategy used to calculate the best + * matching spell-checked word from the hits that Lucene found, one usually has + * to retrieve a couple of numSug's in order to get the true best match. + * + *

I.e. if numSug == 1, don't count on that suggestion being the best one. + * Thus, you should set this value to at least 5 for a good suggestion. + * + *

Uses the {@link #getAccuracy()} value passed into the constructor as the accuracy. + * + * @param word the word you want a spell check done on + * @param numSug the number of suggested words + * @param ir the indexReader of the user index (can be null see field param) + * @param field the field of the user index: if field is not null, the suggested + * words are restricted to the words present in this field. + * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word + * (only if restricted mode = (indexReader!=null and field!=null) + * @throws IOException if the underlying index throws an {@link IOException} + * @throws AlreadyClosedException if the Spellchecker is already closed + * @return String[] the sorted list of the suggest words with these 2 criteria: + * first criteria: the edit distance, second criteria (only if restricted mode): the popularity + * of the suggest words in the field of the user index + * + * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) + */ + public String[] suggestSimilar(String word, int numSug, IndexReader ir, + String field, boolean morePopular) throws IOException { + return suggestSimilar(word, numSug, ir, field, morePopular, accuracy); + } + + + /** + * Suggest similar words (optionally restricted to a field of an index). + * + *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms + * is not the same as the edit distance strategy used to calculate the best + * matching spell-checked word from the hits that Lucene found, one usually has + * to retrieve a couple of numSug's in order to get the true best match. + * + *

I.e. if numSug == 1, don't count on that suggestion being the best one. + * Thus, you should set this value to at least 5 for a good suggestion. + * + * @param word the word you want a spell check done on + * @param numSug the number of suggested words + * @param ir the indexReader of the user index (can be null see field param) + * @param field the field of the user index: if field is not null, the suggested + * words are restricted to the words present in this field. + * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word + * (only if restricted mode = (indexReader!=null and field!=null) + * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results + * @throws IOException if the underlying index throws an {@link IOException} + * @throws AlreadyClosedException if the Spellchecker is already closed + * @return String[] the sorted list of the suggest words with these 2 criteria: + * first criteria: the edit distance, second criteria (only if restricted mode): the popularity + * of the suggest words in the field of the user index + */ + public String[] suggestSimilar(String word, int numSug, IndexReader ir, + String field, boolean morePopular, float accuracy) throws IOException { + // obtainSearcher calls ensureOpen + final IndexSearcher indexSearcher = obtainSearcher(); + try{ + + final int lengthWord = word.length(); + + final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0; + final int goalFreq = (morePopular && ir != null && field != null) ? 
freq : 0; + // if the word exists in the real index and we don't care for word frequency, return the word itself + if (!morePopular && freq > 0) { + return new String[] { word }; + } + + BooleanQuery query = new BooleanQuery(); + String[] grams; + String key; + + for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) { + + key = "gram" + ng; // form key + + grams = formGrams(word, ng); // form word into ngrams (allow dups too) + + if (grams.length == 0) { + continue; // hmm + } + + if (bStart > 0) { // should we boost prefixes? + add(query, "start" + ng, grams[0], bStart); // matches start of word + + } + if (bEnd > 0) { // should we boost suffixes + add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word + + } + for (int i = 0; i < grams.length; i++) { + add(query, key, grams[i]); + } + } + + int maxHits = 10 * numSug; + + // System.out.println("Q: " + query); + ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs; + // System.out.println("HITS: " + hits.length()); + SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator); + + // go thru more than 'maxr' matches in case the distance filter triggers + int stop = Math.min(hits.length, maxHits); + SuggestWord sugWord = new SuggestWord(); + for (int i = 0; i < stop; i++) { + + sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word + + // don't suggest a word for itself, that would be silly + if (sugWord.string.equals(word)) { + continue; + } + + // edit distance + sugWord.score = sd.getDistance(word,sugWord.string); + if (sugWord.score < accuracy) { + continue; + } + + if (ir != null && field != null) { // use the user index + sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index + // don't suggest a word that is not present in the field + if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) { + continue; + } + } + sugQueue.insertWithOverflow(sugWord); + if (sugQueue.size() == numSug) { + // 
if queue full, maintain the minScore score + accuracy = sugQueue.top().score; + } + sugWord = new SuggestWord(); + } + + // convert to array string + String[] list = new String[sugQueue.size()]; + for (int i = sugQueue.size() - 1; i >= 0; i--) { + list[i] = sugQueue.pop().string; + } + + return list; + } finally { + releaseSearcher(indexSearcher); + } + } + /** + * Add a clause to a boolean query. + */ + private static void add(BooleanQuery q, String name, String value, float boost) { + Query tq = new TermQuery(new Term(name, value)); + tq.setBoost(boost); + q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD)); + } + + /** + * Add a clause to a boolean query. + */ + private static void add(BooleanQuery q, String name, String value) { + q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD)); + } + + /** + * Form all ngrams for a given word. + * @param text the word to parse + * @param ng the ngram length e.g. 3 + * @return an array of all ngrams in the word and note that duplicates are not removed + */ + private static String[] formGrams(String text, int ng) { + int len = text.length(); + String[] res = new String[len - ng + 1]; + for (int i = 0; i < len - ng + 1; i++) { + res[i] = text.substring(i, i + ng); + } + return res; + } + + /** + * Removes all terms from the spell check index. + * @throws IOException + * @throws AlreadyClosedException if the Spellchecker is already closed + */ + public void clearIndex() throws IOException { + synchronized (modifyCurrentIndexLock) { + ensureOpen(); + final Directory dir = this.spellIndex; + final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( + Version.LUCENE_CURRENT, + new WhitespaceAnalyzer(Version.LUCENE_CURRENT)) + .setOpenMode(OpenMode.CREATE)); + writer.close(); + swapSearcher(dir); + } + } + + /** + * Check whether the word exists in the index. 
+ * @param word + * @throws IOException + * @throws AlreadyClosedException if the Spellchecker is already closed + * @return true if the word exists in the index + */ + public boolean exist(String word) throws IOException { + // obtainSearcher calls ensureOpen + final IndexSearcher indexSearcher = obtainSearcher(); + try{ + return indexSearcher.docFreq(F_WORD_TERM.createTerm(word)) > 0; + } finally { + releaseSearcher(indexSearcher); + } + } + + /** + * Indexes the data from the given {@link Dictionary}. + * @param dict Dictionary to index + * @param mergeFactor mergeFactor to use when indexing + * @param ramMB the max amount or memory in MB to use + * @param optimize whether or not the spellcheck index should be optimized + * @throws AlreadyClosedException if the Spellchecker is already closed + * @throws IOException + */ + public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException { + synchronized (modifyCurrentIndexLock) { + ensureOpen(); + final Directory dir = this.spellIndex; + final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB)); + ((TieredMergePolicy) writer.getConfig().getMergePolicy()).setMaxMergeAtOnce(mergeFactor); + IndexSearcher indexSearcher = obtainSearcher(); + final List termsEnums = new ArrayList(); + + if (searcher.maxDoc() > 0) { + new ReaderUtil.Gather(searcher.getIndexReader()) { + @Override + protected void add(int base, IndexReader r) throws IOException { + Terms terms = r.terms(F_WORD); + if (terms != null) + termsEnums.add(terms.iterator()); + } + }.run(); + } + + boolean isEmpty = termsEnums.isEmpty(); + + try { + Iterator iter = dict.getWordsIterator(); + BytesRef currentTerm = new BytesRef(); + + terms: while (iter.hasNext()) { + String word = iter.next(); + + int len = word.length(); + if (len < 3) { + continue; // too short we bail but "too long" is fine... 
+ } + + if (!isEmpty) { + // we have a non-empty index, check if the term exists + currentTerm.copy(word); + for (TermsEnum te : termsEnums) { + if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) { + continue terms; + } + } + } + + // ok index the word + Document doc = createDocument(word, getMin(len), getMax(len)); + writer.addDocument(doc); + } + } finally { + releaseSearcher(indexSearcher); + } + // close writer + if (optimize) + writer.optimize(); + writer.close(); + // also re-open the spell index to see our own changes when the next suggestion + // is fetched: + swapSearcher(dir); + } + } + + /** + * Indexes the data from the given {@link Dictionary}. + * @param dict the dictionary to index + * @param mergeFactor mergeFactor to use when indexing + * @param ramMB the max amount or memory in MB to use + * @throws IOException + */ + public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException { + indexDictionary(dict, mergeFactor, ramMB, true); + } + + /** + * Indexes the data from the given {@link Dictionary}. + * @param dict the dictionary to index + * @throws IOException + */ + public final void indexDictionary(Dictionary dict) throws IOException { + indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB); + } + + private static int getMin(int l) { + if (l > 5) { + return 3; + } + if (l == 5) { + return 2; + } + return 1; + } + + private static int getMax(int l) { + if (l > 5) { + return 4; + } + if (l == 5) { + return 3; + } + return 2; + } + + private static Document createDocument(String text, int ng1, int ng2) { + Document doc = new Document(); + // the word field is never queried on... its indexed so it can be quickly + // checked for rebuild (and stored for retrieval). 
Doesn't need norms or TF/pos + Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED); + f.setOmitTermFreqAndPositions(true); + f.setOmitNorms(true); + doc.add(f); // orig term + addGram(text, doc, ng1, ng2); + return doc; + } + + private static void addGram(String text, Document doc, int ng1, int ng2) { + int len = text.length(); + for (int ng = ng1; ng <= ng2; ng++) { + String key = "gram" + ng; + String end = null; + for (int i = 0; i < len - ng + 1; i++) { + String gram = text.substring(i, i + ng); + doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED)); + if (i == 0) { + // only one term possible in the startXXField, TF/pos and norms aren't needed. + Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED); + startField.setOmitTermFreqAndPositions(true); + startField.setOmitNorms(true); + doc.add(startField); + } + end = gram; + } + if (end != null) { // may not be present if len==ng1 + // only one term possible in the endXXField, TF/pos and norms aren't needed. 
+ Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED); + endField.setOmitTermFreqAndPositions(true); + endField.setOmitNorms(true); + doc.add(endField); + } + } + } + + private IndexSearcher obtainSearcher() { + synchronized (searcherLock) { + ensureOpen(); + searcher.getIndexReader().incRef(); + return searcher; + } + } + + private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{ + // don't check if open - always decRef + // don't decrement the private searcher - could have been swapped + aSearcher.getIndexReader().decRef(); + } + + private void ensureOpen() { + if (closed) { + throw new AlreadyClosedException("Spellchecker has been closed"); + } + } + + /** + * Close the IndexSearcher used by this SpellChecker + * @throws IOException if the close operation causes an {@link IOException} + * @throws AlreadyClosedException if the {@link SpellChecker} is already closed + */ + public void close() throws IOException { + synchronized (searcherLock) { + ensureOpen(); + closed = true; + if (searcher != null) { + searcher.close(); + } + searcher = null; + } + } + + private void swapSearcher(final Directory dir) throws IOException { + /* + * opening a searcher is possibly very expensive. + * We rather close it again if the Spellchecker was closed during + * this operation than block access to the current searcher while opening. + */ + final IndexSearcher indexSearcher = createSearcher(dir); + synchronized (searcherLock) { + if(closed){ + indexSearcher.close(); + throw new AlreadyClosedException("Spellchecker has been closed"); + } + if (searcher != null) { + searcher.close(); + } + // set the spellindex in the sync block - ensure consistency. 
+ searcher = indexSearcher; + this.spellIndex = dir; + } + } + + /** + * Creates a new read-only IndexSearcher + * @param dir the directory used to open the searcher + * @return a new read-only IndexSearcher + * @throws IOException f there is a low-level IO error + */ + // for testing purposes + IndexSearcher createSearcher(final Directory dir) throws IOException{ + return new IndexSearcher(dir, true); + } + + /** + * Returns true if and only if the {@link SpellChecker} is + * closed, otherwise false. + * + * @return true if and only if the {@link SpellChecker} is + * closed, otherwise false. + */ + boolean isClosed(){ + return closed; + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/StringDistance.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/StringDistance.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/StringDistance.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/StringDistance.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,35 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Interface for string distances. + */ +public interface StringDistance { + + /** + * Returns a float between 0 and 1 based on how similar the specified strings are to one another. + * Returning a value of 1 means the specified strings are identical and 0 means the + * string are maximally different. + * @param s1 The first string. + * @param s2 The second string. + * @return a float between 0 and 1 based on how similar the specified strings are to one another. + */ + public float getDistance(String s1,String s2); + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWord.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWord.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWord.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWord.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,45 @@ +package org.apache.lucene.search.spell; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * SuggestWord, used in suggestSimilar method in SpellChecker class. + *

+ * Default sort is first by score, then by frequency. + * + * + */ +public final class SuggestWord{ + + /** + * the score of the word + */ + public float score; + + /** + * The freq of the word + */ + public int freq; + + /** + * the suggested word + */ + public String string; + +} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,47 @@ +package org.apache.lucene.search.spell; + +import java.util.Comparator; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Frequency first, then score. 
Must have + * + **/ +public class SuggestWordFrequencyComparator implements Comparator { + + public int compare(SuggestWord first, SuggestWord second) { + // first criteria: the frequency + if (first.freq > second.freq) { + return 1; + } + if (first.freq < second.freq) { + return -1; + } + + // second criteria (if first criteria is equal): the score + if (first.score > second.score) { + return 1; + } + if (first.score < second.score) { + return -1; + } + // third criteria: term text + return second.string.compareTo(first.string); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,63 @@ +package org.apache.lucene.search.spell; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.PriorityQueue; + +import java.util.Comparator; + + +/** + * Sorts SuggestWord instances + * + * @see org.apache.lucene.search.spell.SuggestWordScoreComparator + * @see org.apache.lucene.search.spell.SuggestWordFrequencyComparator + * + */ +public final class SuggestWordQueue extends PriorityQueue { + public static final Comparator DEFAULT_COMPARATOR = new SuggestWordScoreComparator(); + + + private Comparator comparator; + + /** + * Use the {@link #DEFAULT_COMPARATOR} + * @param size The size of the queue + */ + public SuggestWordQueue (int size) { + super(size); + comparator = DEFAULT_COMPARATOR; + } + + /** + * Specify the size of the queue and the comparator to use for sorting. + * @param size The size + * @param comparator The comparator. + */ + public SuggestWordQueue(int size, Comparator comparator){ + super(size); + this.comparator = comparator; + } + + @Override + protected final boolean lessThan (SuggestWord wa, SuggestWord wb) { + int val = comparator.compare(wa, wb); + return val < 0; + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,47 @@ +package org.apache.lucene.search.spell; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Comparator; + + +/** + * Score first, then frequency + * + **/ +public class SuggestWordScoreComparator implements Comparator { + public int compare(SuggestWord first, SuggestWord second) { + // first criteria: the distance + if (first.score > second.score) { + return 1; + } + if (first.score < second.score) { + return -1; + } + + // second criteria (if first criteria is equal): the popularity + if (first.freq > second.freq) { + return 1; + } + + if (first.freq < second.freq) { + return -1; + } + // third criteria: term text + return second.string.compareTo(first.string); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java 2011-05-22 19:00:30.000000000 -0400 @@ -0,0 +1,33 @@ +package org.apache.lucene.search.spell; + +import java.util.Iterator; + +public interface TermFreqIterator extends Iterator { + + public float freq(); + + public static class TermFreqIteratorWrapper implements TermFreqIterator { + private Iterator wrapped; + + public TermFreqIteratorWrapper(Iterator 
wrapped) { + this.wrapped = wrapped; + } + + public float freq() { + return 1.0f; + } + + public boolean hasNext() { + return wrapped.hasNext(); + } + + public String next() { + return wrapped.next().toString(); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java 2011-05-22 17:03:03.000000000 -0400 @@ -0,0 +1,65 @@ +package org.apache.lucene.search.suggest; + + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.search.spell.TermFreqIterator; + +/** + * This wrapper buffers incoming elements. + */ +public class BufferingTermFreqIteratorWrapper implements TermFreqIterator { + + /** Entry in the buffer. 
*/ + public static final class Entry implements Comparable { + String word; + float freq; + + public Entry(String word, float freq) { + this.word = word; + this.freq = freq; + } + + public int compareTo(Entry o) { + return word.compareTo(o.word); + } + } + + protected ArrayList entries = new ArrayList(); + + protected int curPos; + protected Entry curEntry; + + public BufferingTermFreqIteratorWrapper(TermFreqIterator source) { + // read all source data into buffer + while (source.hasNext()) { + String w = source.next(); + Entry e = new Entry(w, source.freq()); + entries.add(e); + } + curPos = 0; + } + + public float freq() { + return curEntry.freq; + } + + public boolean hasNext() { + return curPos < entries.size(); + } + + public String next() { + curEntry = entries.get(curPos); + curPos++; + return curEntry.word; + } + + public void remove() { + throw new UnsupportedOperationException("remove is not supported"); + } + + public List entries() { + return entries; + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java 2011-05-22 17:03:10.000000000 -0400 @@ -0,0 +1,95 @@ +package org.apache.lucene.search.suggest; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.*; + +import org.apache.lucene.search.spell.Dictionary; +import org.apache.lucene.search.spell.TermFreqIterator; + + +/** + * Dictionary represented by a text file. + * + *

Format allowed: 1 string per line, optionally with a tab-separated integer value:
+ * word1 TAB 100
+ * word2 word3 TAB 101
+ * word4 word5 TAB 102
+ */ +public class FileDictionary implements Dictionary { + + private BufferedReader in; + private String line; + private boolean hasNextCalled; + + public FileDictionary(InputStream dictFile) { + in = new BufferedReader(new InputStreamReader(dictFile)); + } + + /** + * Creates a dictionary based on a reader. + */ + public FileDictionary(Reader reader) { + in = new BufferedReader(reader); + } + + public TermFreqIterator getWordsIterator() { + return new fileIterator(); + } + + final class fileIterator implements TermFreqIterator { + private float curFreq; + + public String next() { + if (!hasNextCalled) { + hasNext(); + } + hasNextCalled = false; + return line; + } + + public float freq() { + return curFreq; + } + + public boolean hasNext() { + hasNextCalled = true; + try { + line = in.readLine(); + if (line != null) { + String[] fields = line.split("\t"); + if (fields.length > 1) { + curFreq = Float.parseFloat(fields[1]); + line = fields[0]; + } else { + curFreq = 1; + } + } + } catch (IOException ex) { + throw new RuntimeException(ex); + } + return (line != null) ? 
true : false; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java 2011-05-22 18:18:14.000000000 -0400 @@ -0,0 +1,117 @@ +package org.apache.lucene.search.suggest; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.search.spell.Dictionary; +import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.PriorityQueue; + +public abstract class Lookup { + /** + * Result of a lookup. + */ + public static final class LookupResult implements Comparable { + public final String key; + public final float value; + + public LookupResult(String key, float value) { + this.key = key; + this.value = value; + } + + @Override + public String toString() { + return key + "/" + value; + } + + /** Compare alphabetically. */ + public int compareTo(LookupResult o) { + return this.key.compareTo(o.key); + } + } + + public static final class LookupPriorityQueue extends PriorityQueue { + + public LookupPriorityQueue(int size) { + super(size); + } + + @Override + protected boolean lessThan(LookupResult a, LookupResult b) { + return a.value < b.value; + } + + public LookupResult[] getResults() { + int size = size(); + LookupResult[] res = new LookupResult[size]; + for (int i = size - 1; i >= 0; i--) { + res[i] = pop(); + } + return res; + } + } + + /** Build lookup from a dictionary. 
Some implementations may require sorted + * or unsorted keys from the dictionary's iterator - use + * {@link SortedTermFreqIteratorWrapper} or + * {@link UnsortedTermFreqIteratorWrapper} in such case. + */ + public void build(Dictionary dict) throws IOException { + Iterator it = dict.getWordsIterator(); + TermFreqIterator tfit; + if (it instanceof TermFreqIterator) { + tfit = (TermFreqIterator)it; + } else { + tfit = new TermFreqIterator.TermFreqIteratorWrapper(it); + } + build(tfit); + } + + public abstract void build(TermFreqIterator tfit) throws IOException; + + /** + * Persist the constructed lookup data to a directory. Optional operation. + * @param storeDir directory where data can be stored. + * @return true if successful, false if unsuccessful or not supported. + * @throws IOException when fatal IO error occurs. + */ + public abstract boolean store(File storeDir) throws IOException; + + /** + * Discard current lookup data and load it from a previously saved copy. + * Optional operation. + * @param storeDir directory where lookup data was stored. + * @return true if completed successfully, false if unsuccessful or not supported. + * @throws IOException when fatal IO error occurs. + */ + public abstract boolean load(File storeDir) throws IOException; + + /** + * Look up a key and return possible completion for this key. + * @param key lookup key. Depending on the implementation this may be + * a prefix, misspelling, or even infix. + * @param onlyMorePopular return only more popular results + * @param num maximum number of results to return + * @return a list of possible completions, with their relative weight (e.g. popularity) + */ + public abstract List lookup(String key, boolean onlyMorePopular, int num); + + /** + * Modify the lookup data by recording additional data. Optional operation. 
+ * @param key new lookup key + * @param value value to associate with this key + * @return true if new key is added, false if it already exists or operation + * is not supported. + */ + public abstract boolean add(String key, Object value); + + /** + * Get value associated with a specific key. + * @param key lookup key + * @return associated value + */ + public abstract Object get(String key); +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java 2011-05-22 17:03:26.000000000 -0400 @@ -0,0 +1,18 @@ +package org.apache.lucene.search.suggest; + +import java.util.Collections; + +import org.apache.lucene.search.spell.SortedIterator; +import org.apache.lucene.search.spell.TermFreqIterator; + +/** + * This wrapper buffers incoming elements and makes sure they are sorted in + * ascending lexicographic order. 
+ */ +public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator { + + public SortedTermFreqIteratorWrapper(TermFreqIterator source) { + super(source); + Collections.sort(entries); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java 2011-05-22 17:03:33.000000000 -0400 @@ -0,0 +1,17 @@ +package org.apache.lucene.search.suggest; + +import java.util.Collections; + +import org.apache.lucene.search.spell.TermFreqIterator; + +/** + * This wrapper buffers the incoming elements and makes sure they are in + * random order. 
+ */ +public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper { + + public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) { + super(source); + Collections.shuffle(entries); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java 2011-05-22 18:59:15.000000000 -0400 @@ -0,0 +1,540 @@ +package org.apache.lucene.search.suggest.fst; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.automaton.fst.Builder; +import org.apache.lucene.util.automaton.fst.FST; +import org.apache.lucene.util.automaton.fst.FST.Arc; +import org.apache.lucene.util.automaton.fst.NoOutputs; +import org.apache.lucene.util.automaton.fst.Outputs; + +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.tst.TSTLookup; +import org.apache.lucene.search.spell.TermFreqIterator; + +/** + * Finite state automata based implementation of {@link Lookup} query + * suggestion/ autocomplete interface. + * + *

Implementation details

+ * + *

The construction step in {@link #build(TermFreqIterator)} works as follows: + *

    + *
  • A set of input terms (String) and weights (float) is given.
  • + *
  • The range of weights is determined and then all weights are discretized into a fixed set + * of values ({@link #buckets}). + * Note that this means that minor changes in weights may be lost during automaton construction. + * In general, this is not a big problem because the "priorities" of completions can be split + * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal). + * If you need exact, fine-grained weights, use {@link TSTLookup} instead.
  • + *
  • All terms in the input are prepended with a synthetic pseudo-character being the weight + * of that term. For example a term abc with a discretized weight equal '1' would + * become 1abc.
  • + *
  • The terms are sorted by their raw utf16 character values (including the synthetic + * term in front).
  • + *
  • A finite state automaton ({@link FST}) is constructed from the input. The root node has + * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.
  • + *
+ * + *

At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows: + *

    + *
  • For each possible term weight encoded in the automaton (cached arcs from the root above), + * starting with the highest one, we descend along the path of the input key. If the key is not + * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately. + * No completions. + *
  • Otherwise, we have found an internal automaton node that ends the key. The entire + * subautomaton (all paths) starting from this node form the key's completions. We start + * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single + * suggestion to the list of results (the weight of this suggestion is constant and equal to the + * root path we started from). The tricky part is that because automaton edges are sorted and + * we scan depth-first, we can terminate the entire procedure as soon as we collect enough + * suggestions the user requested. + *
  • In case the number of suggestions collected in the step above is still insufficient, + * we proceed to the next (smaller) weight leaving the root node and repeat the same + * algorithm again. + *
  • + *
+ * + *

Runtime behavior and performance characteristics

+ * + *

The algorithm described above is optimized for finding suggestions to short prefixes + * in a top-weights-first order. This is probably the most common use case: it allows + * presenting suggestions early and sorts them by the global frequency (and then alphabetically). + * + *

If there is an exact match in the automaton, it is returned first on the results + * list (even with by-weight sorting). + * + *

Note that the maximum lookup time for any prefix + * is the time of descending to the subtree, plus traversal of the subtree up to the number + * of requested suggestions (because they are already presorted by weight on the root level + * and alphabetically at any node level). + * + *

To order alphabetically only (no ordering by priorities), use identical term weights + * for all terms. Alphabetical suggestions are returned even if non-constant weights are + * used, but the algorithm for doing this is suboptimal. + * + *

"alphabetically" in any of the documentation above indicates utf16 codepoint order, + * nothing else. + */ +public class FSTLookup extends Lookup { + + public FSTLookup() { + this(10, true); + } + + public FSTLookup(int buckets, boolean exactMatchFirst) { + this.buckets = buckets; + this.exactMatchFirst = exactMatchFirst; + } + + /** A structure for a single entry (for sorting/ preprocessing). */ + private static class Entry { + char [] term; + float weight; + + public Entry(char [] term, float freq) { + this.term = term; + this.weight = freq; + } + } + + /** Serialized automaton file name (storage). */ + public static final String FILENAME = "fst.dat"; + + /** An empty result. */ + private static final List EMPTY_RESULT = Collections.emptyList(); + + /** + * The number of separate buckets for weights (discretization). The more buckets, + * the more fine-grained term weights (priorities) can be assigned. The speed of lookup + * will not decrease for prefixes which have highly-weighted completions (because these + * are filled-in first), but will decrease significantly for low-weighted terms (but + * these should be infrequent, so it is all right). + * + *

The number of buckets must be within the [1, 255] range.
+ */ + @SuppressWarnings("unchecked") + private void cacheRootArcs() throws IOException { + if (automaton != null) { + List> rootArcs = new ArrayList>(); + Arc arc = automaton.getFirstArc(new Arc()); + automaton.readFirstTargetArc(arc, arc); + while (true) { + rootArcs.add(new Arc().copyFrom(arc)); + if (arc.isLast()) + break; + automaton.readNextArc(arc); + } + + Collections.reverse(rootArcs); // we want highest weights first. + this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]); + } + } + + /** + * Not implemented. + */ + @Override + public boolean add(String key, Object value) { + // This implementation does not support ad-hoc additions (all input + // must be sorted for the builder). + return false; + } + + /** + * Get the (approximated) weight of a single key (if there is a perfect match + * for it in the automaton). + * + * @return Returns the approximated weight of the input key or null + * if not found. + */ + @Override + public Float get(String key) { + return getExactMatchStartingFromRootArc(0, key); + } + + /** + * Returns the first exact match by traversing root arcs, starting from + * the arc i. + * + * @param i The first root arc index in {@link #rootArcs} to consider when + * matching. + */ + private Float getExactMatchStartingFromRootArc(int i, String key) { + // Get the UTF-8 bytes representation of the input key. + try { + final FST.Arc scratch = new FST.Arc(); + for (; i < rootArcs.length; i++) { + final FST.Arc rootArc = rootArcs[i]; + final FST.Arc arc = scratch.copyFrom(rootArc); + + // Descend into the automaton using the key as prefix. + if (descendWithPrefix(arc, key)) { + automaton.readFirstTargetArc(arc, arc); + if (arc.label == FST.END_LABEL) { + // Prefix-encoded weight. + return rootArc.label / (float) buckets; + } + } + } + } catch (IOException e) { + // Should never happen, but anyway. + throw new RuntimeException(e); + } + + return null; + } + + /** + * Lookup autocomplete suggestions to key. 
+ * + * @param key The prefix to which suggestions should be sought. + * @param onlyMorePopular Return most popular suggestions first. This is the default + * behavior for this implementation. Setting it to false has no effect (use + * constant term weights to sort alphabetically only). + * @param num At most this number of suggestions will be returned. + * @return Returns the suggestions, sorted by their approximated weight first (decreasing) + * and then alphabetically (utf16 codepoint order). + */ + @Override + public List lookup(String key, boolean onlyMorePopular, int num) { + if (key.length() == 0 || automaton == null) { + // Keep the result an ArrayList to keep calls monomorphic. + return EMPTY_RESULT; + } + + try { + if (!onlyMorePopular && rootArcs.length > 1) { + // We could emit a warning here (?). An optimal strategy for alphabetically sorted + // suggestions would be to add them with a constant weight -- this saves unnecessary + // traversals and sorting. + return lookupSortedAlphabetically(key, num); + } else { + return lookupSortedByWeight(key, num, true); + } + } catch (IOException e) { + // Should never happen, but anyway. + throw new RuntimeException(e); + } + } + + /** + * Lookup suggestions sorted alphabetically if weights are not constant. This + * is a workaround: in general, use constant weights for alphabetically sorted result. + */ + private List lookupSortedAlphabetically(String key, int num) throws IOException { + // Greedily get num results from each weight branch. + List res = lookupSortedByWeight(key, num, false); + + // Sort and trim. + Collections.sort(res, new Comparator() { + @Override + public int compare(LookupResult o1, LookupResult o2) { + return o1.key.compareTo(o2.key); + } + }); + if (res.size() > num) { + res = res.subList(0, num); + } + return res; + } + + /** + * Lookup suggestions sorted by weight (descending order). 
+ * + * @param greedy If true, the routine terminates immediately when num + * suggestions have been collected. If false, it will collect suggestions from + * all weight arcs (needed for {@link #lookupSortedAlphabetically}. + */ + private ArrayList lookupSortedByWeight(String key, int num, boolean greedy) throws IOException { + final ArrayList res = new ArrayList(Math.min(10, num)); + final StringBuilder output = new StringBuilder(key); + final int matchLength = key.length() - 1; + + for (int i = 0; i < rootArcs.length; i++) { + final FST.Arc rootArc = rootArcs[i]; + final FST.Arc arc = new FST.Arc().copyFrom(rootArc); + + // Descend into the automaton using the key as prefix. + if (descendWithPrefix(arc, key)) { + // Prefix-encoded weight. + final float weight = rootArc.label / (float) buckets; + + // A subgraph starting from the current node has the completions + // of the key prefix. The arc we're at is the last key's byte, + // so we will collect it too. + output.setLength(matchLength); + if (collect(res, num, weight, output, arc) && greedy) { + // We have enough suggestion to return immediately. Keep on looking for an + // exact match, if requested. + if (exactMatchFirst) { + Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key); + if (exactMatchWeight != null) { + res.add(0, new LookupResult(key, exactMatchWeight)); + while (res.size() > num) { + res.remove(res.size() - 1); + } + } + } + break; + } + } + } + return res; + } + + /** + * Descend along the path starting at arc and going through + * bytes in utf8 argument. + * + * @param arc The starting arc. This argument is modified in-place. + * @param term The term to descend with. + * @return If true, arc will be set to the arc matching + * last byte of utf8. false is returned if no such + * prefix utf8 exists. 
+ */ + private boolean descendWithPrefix(Arc arc, String term) throws IOException { + final int max = term.length(); + + for (int i = 0; i < max; i++) { + if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) { + // No matching prefixes, return an empty result. + return false; + } + } + + return true; + } + + /** + * Recursive collect lookup results from the automaton subgraph starting at arc. + * + * @param num Maximum number of results needed (early termination). + * @param weight Weight of all results found during this collection. + */ + private boolean collect(List res, int num, float weight, StringBuilder output, Arc arc) throws IOException { + output.append((char) arc.label); + + automaton.readFirstTargetArc(arc, arc); + while (true) { + if (arc.label == FST.END_LABEL) { + res.add(new LookupResult(output.toString(), weight)); + if (res.size() >= num) + return true; + } else { + int save = output.length(); + if (collect(res, num, weight, output, new Arc().copyFrom(arc))) { + return true; + } + output.setLength(save); + } + + if (arc.isLast()) { + break; + } + automaton.readNextArc(arc); + } + return false; + } + + /** + * Builds the final automaton from a list of entries. + */ + private FST buildAutomaton(List entries) throws IOException { + if (entries.size() == 0) + return null; + + // Sort by utf16 (raw char value) + final Comparator comp = new Comparator() { + public int compare(Entry o1, Entry o2) { + char [] ch1 = o1.term; + char [] ch2 = o2.term; + int len1 = ch1.length; + int len2 = ch2.length; + + int max = Math.min(len1, len2); + for (int i = 0; i < max; i++) { + int v = ch1[i] - ch2[i]; + if (v != 0) return v; + } + return len1 - len2; + } + }; + Collections.sort(entries, comp); + + // Avoid duplicated identical entries, if possible. This is required because + // it breaks automaton construction otherwise. 
+ int len = entries.size(); + int j = 0; + for (int i = 1; i < len; i++) { + if (comp.compare(entries.get(j), entries.get(i)) != 0) { + entries.set(++j, entries.get(i)); + } + } + entries = entries.subList(0, j + 1); + + // Build the automaton. + final Outputs outputs = NoOutputs.getSingleton(); + final Object empty = outputs.getNoOutput(); + final Builder builder = + new Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); + final IntsRef scratchIntsRef = new IntsRef(10); + for (Entry e : entries) { + final int termLength = scratchIntsRef.length = e.term.length; + + scratchIntsRef.grow(termLength); + final int [] ints = scratchIntsRef.ints; + final char [] chars = e.term; + for (int i = termLength; --i >= 0;) { + ints[i] = chars[i]; + } + builder.add(scratchIntsRef, empty); + } + return builder.finish(); + } + + /** + * Prepends the entry's weight to each entry, encoded as a single byte, so that the + * root automaton node fans out to all possible priorities, starting with the arc that has + * the highest weights. + */ + private void encodeWeightPrefix(List entries) { + for (Entry e : entries) { + int weight = (int) e.weight; + assert (weight >= 0 && weight <= buckets) : + "Weight out of range: " + weight + " [" + buckets + "]"; + + // There should be a single empty char reserved in front for the weight. + e.term[0] = (char) weight; + } + } + + /** + * Split [min, max] range into buckets, reassigning weights. Entries' weights are + * remapped to [0, buckets] range (so, buckets + 1 buckets, actually). + */ + private void redistributeWeightsProportionalMinMax(List entries, int buckets) { + float min = entries.get(0).weight; + float max = min; + for (Entry e : entries) { + min = Math.min(e.weight, min); + max = Math.max(e.weight, max); + } + + final float range = max - min; + for (Entry e : entries) { + e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor() + } + } + + /** + * Deserialization from disk. 
+ */ + @Override + public synchronized boolean load(File storeDir) throws IOException { + File data = new File(storeDir, FILENAME); + if (!data.exists() || !data.canRead()) { + return false; + } + + InputStream is = new BufferedInputStream(new FileInputStream(data)); + try { + this.automaton = new FST(new InputStreamDataInput(is), NoOutputs.getSingleton()); + cacheRootArcs(); + } finally { + IOUtils.closeSafely(is); + } + return true; + } + + /** + * Serialization to disk. + */ + @Override + public synchronized boolean store(File storeDir) throws IOException { + if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { + return false; + } + + if (this.automaton == null) + return false; + + File data = new File(storeDir, FILENAME); + OutputStream os = new BufferedOutputStream(new FileOutputStream(data)); + try { + this.automaton.save(new OutputStreamDataOutput(os)); + } finally { + IOUtils.closeSafely(os); + } + + return true; + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InputStreamDataInput.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InputStreamDataInput.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InputStreamDataInput.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InputStreamDataInput.java 2011-05-22 17:16:29.000000000 -0400 @@ -0,0 +1,32 @@ +package org.apache.lucene.search.suggest.fst; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import org.apache.lucene.store.DataInput; + +/** + * A {@link DataInput} wrapping a plain {@link InputStream}. 
+ */ +public class InputStreamDataInput extends DataInput { + + private final InputStream is; + + public InputStreamDataInput(InputStream is) { + this.is = is; + } + + @Override + public byte readByte() throws IOException { + int v = is.read(); + if (v == -1) throw new EOFException(); + return (byte) v; + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + if (is.read(b, offset, len) != len) { + throw new EOFException(); + } + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/OutputStreamDataOutput.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/OutputStreamDataOutput.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/OutputStreamDataOutput.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/OutputStreamDataOutput.java 2011-05-22 17:04:53.000000000 -0400 @@ -0,0 +1,28 @@ +package org.apache.lucene.search.suggest.fst; + +import java.io.IOException; +import java.io.OutputStream; + +import org.apache.lucene.store.DataOutput; + +/** + * A {@link DataOutput} wrapping a plain {@link OutputStream}. 
+ */ +public class OutputStreamDataOutput extends DataOutput { + + private final OutputStream os; + + public OutputStreamDataOutput(OutputStream os) { + this.os = os; + } + + @Override + public void writeByte(byte b) throws IOException { + os.write(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int length) throws IOException { + os.write(b, offset, length); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java 2011-05-22 18:07:36.000000000 -0400 @@ -0,0 +1,172 @@ +package org.apache.lucene.search.suggest.jaspell; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.search.spell.SortedIterator; +import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper; +import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode; + +public class JaspellLookup extends Lookup { + JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie(); + private boolean usePrefix = true; + private int editDistance = 2; + + @Override + public void build(TermFreqIterator tfit) throws IOException { + if (tfit instanceof SortedIterator) { + // make sure it's unsorted + tfit = new UnsortedTermFreqIteratorWrapper(tfit); + } + trie = new JaspellTernarySearchTrie(); + trie.setMatchAlmostDiff(editDistance); + 
while (tfit.hasNext()) { + String key = tfit.next(); + float freq = tfit.freq(); + if (key.length() == 0) { + continue; + } + trie.put(key, new Float(freq)); + } + } + + @Override + public boolean add(String key, Object value) { + trie.put(key, value); + // XXX + return false; + } + + @Override + public Object get(String key) { + return trie.get(key); + } + + @Override + public List lookup(String key, boolean onlyMorePopular, int num) { + List res = new ArrayList(); + List list; + int count = onlyMorePopular ? num * 2 : num; + if (usePrefix) { + list = trie.matchPrefix(key, count); + } else { + list = trie.matchAlmost(key, count); + } + if (list == null || list.size() == 0) { + return res; + + } + int maxCnt = Math.min(num, list.size()); + if (onlyMorePopular) { + LookupPriorityQueue queue = new LookupPriorityQueue(num); + for (String s : list) { + float freq = (Float)trie.get(s); + queue.insertWithOverflow(new LookupResult(s, freq)); + } + for (LookupResult lr : queue.getResults()) { + res.add(lr); + } + } else { + for (int i = 0; i < maxCnt; i++) { + String s = list.get(i); + float freq = (Float)trie.get(s); + res.add(new LookupResult(s, freq)); + } + } + return res; + } + + public static final String FILENAME = "jaspell.dat"; + private static final byte LO_KID = 0x01; + private static final byte EQ_KID = 0x02; + private static final byte HI_KID = 0x04; + private static final byte HAS_VALUE = 0x08; + + + @Override + public boolean load(File storeDir) throws IOException { + File data = new File(storeDir, FILENAME); + if (!data.exists() || !data.canRead()) { + return false; + } + DataInputStream in = new DataInputStream(new FileInputStream(data)); + TSTNode root = trie.new TSTNode('\0', null); + try { + readRecursively(in, root); + trie.setRoot(root); + } finally { + in.close(); + } + return true; + } + + private void readRecursively(DataInputStream in, TSTNode node) throws IOException { + node.splitchar = in.readChar(); + byte mask = in.readByte(); + if ((mask & 
HAS_VALUE) != 0) { + node.data = new Float(in.readFloat()); + } + if ((mask & LO_KID) != 0) { + TSTNode kid = trie.new TSTNode('\0', node); + node.relatives[TSTNode.LOKID] = kid; + readRecursively(in, kid); + } + if ((mask & EQ_KID) != 0) { + TSTNode kid = trie.new TSTNode('\0', node); + node.relatives[TSTNode.EQKID] = kid; + readRecursively(in, kid); + } + if ((mask & HI_KID) != 0) { + TSTNode kid = trie.new TSTNode('\0', node); + node.relatives[TSTNode.HIKID] = kid; + readRecursively(in, kid); + } + } + + @Override + public boolean store(File storeDir) throws IOException { + if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { + return false; + } + TSTNode root = trie.getRoot(); + if (root == null) { // empty tree + return false; + } + File data = new File(storeDir, FILENAME); + DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); + try { + writeRecursively(out, root); + out.flush(); + } finally { + out.close(); + } + return true; + } + + private void writeRecursively(DataOutputStream out, TSTNode node) throws IOException { + if (node == null) { + return; + } + out.writeChar(node.splitchar); + byte mask = 0; + if (node.relatives[TSTNode.LOKID] != null) mask |= LO_KID; + if (node.relatives[TSTNode.EQKID] != null) mask |= EQ_KID; + if (node.relatives[TSTNode.HIKID] != null) mask |= HI_KID; + if (node.data != null) mask |= HAS_VALUE; + out.writeByte(mask); + if (node.data != null) { + out.writeFloat((Float)node.data); + } + writeRecursively(out, node.relatives[TSTNode.LOKID]); + writeRecursively(out, node.relatives[TSTNode.EQKID]); + writeRecursively(out, node.relatives[TSTNode.HIKID]); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellTernarySearchTrie.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellTernarySearchTrie.java --- 
lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellTernarySearchTrie.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellTernarySearchTrie.java 2011-05-22 17:05:14.000000000 -0400 @@ -0,0 +1,866 @@ +package org.apache.lucene.search.suggest.jaspell; + +/** + * Copyright (c) 2005 Bruno Martins + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the organization nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.List; +import java.util.Vector; +import java.util.zip.GZIPInputStream; + +/** + * Implementation of a Ternary Search Trie, a data structure for storing + * String objects that combines the compact size of a binary search + * tree with the speed of a digital search trie, and is therefore ideal for + * practical use in sorting and searching data.

+ *

+ * + * This data structure is faster than hashing for many typical search problems, + * and supports a broader range of useful problems and operations. Ternary + * searches are faster than hashing and more powerful, too. + *

+ *

+ * + * The theory of ternary search trees was described at a symposium in 1997 (see + * "Fast Algorithms for Sorting and Searching Strings," by J.L. Bentley and R. + * Sedgewick, Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete + * Algorithms, January 1997). Algorithms in C, Third Edition, by Robert + * Sedgewick (Addison-Wesley, 1998) provides yet another view of ternary search + * trees. + * + * @author Bruno Martins + * + */ +public class JaspellTernarySearchTrie { + + /** + * An inner class of Ternary Search Trie that represents a node in the trie. + */ + protected final class TSTNode { + + /** Index values for accessing relatives array. */ + protected final static int PARENT = 0, LOKID = 1, EQKID = 2, HIKID = 3; + + /** The key to the node. */ + protected Object data; + + /** The relative nodes. */ + protected TSTNode[] relatives = new TSTNode[4]; + + /** The char used in the split. */ + protected char splitchar; + + /** + * Constructor method. + * + *@param splitchar + * The char used in the split. + *@param parent + * The parent node. + */ + protected TSTNode(char splitchar, TSTNode parent) { + this.splitchar = splitchar; + relatives[PARENT] = parent; + } + } + + /** + * Compares characters by alfabetical order. + * + *@param cCompare2 + * The first char in the comparison. + *@param cRef + * The second char in the comparison. + *@return A negative number, 0 or a positive number if the second char is + * less, equal or greater. + */ + private static int compareCharsAlphabetically(char cCompare2, char cRef) { + return Character.toLowerCase(cCompare2) - Character.toLowerCase(cRef); + } + + /* what follows is the original Jaspell code. 
+ private static int compareCharsAlphabetically(int cCompare2, int cRef) { + int cCompare = 0; + if (cCompare2 >= 65) { + if (cCompare2 < 89) { + cCompare = (2 * cCompare2) - 65; + } else if (cCompare2 < 97) { + cCompare = cCompare2 + 24; + } else if (cCompare2 < 121) { + cCompare = (2 * cCompare2) - 128; + } else cCompare = cCompare2; + } else cCompare = cCompare2; + if (cRef < 65) { + return cCompare - cRef; + } + if (cRef < 89) { + return cCompare - ((2 * cRef) - 65); + } + if (cRef < 97) { + return cCompare - (cRef + 24); + } + if (cRef < 121) { + return cCompare - ((2 * cRef) - 128); + } + return cCompare - cRef; + } + */ + + /** + * The default number of values returned by the matchAlmost + * method. + */ + private int defaultNumReturnValues = -1; + + /** + * the number of differences allowed in a call to the + * matchAlmostKey method. + */ + private int matchAlmostDiff; + + /** The base node in the trie. */ + private TSTNode rootNode; + + /** + * Constructs an empty Ternary Search Trie. + */ + public JaspellTernarySearchTrie() { + } + + // for loading + void setRoot(TSTNode newRoot) { + rootNode = newRoot; + } + + // for saving + TSTNode getRoot() { + return rootNode; + } + + /** + * Constructs a Ternary Search Trie and loads data from a File + * into the Trie. The file is a normal text document, where each line is of + * the form word TAB float. + * + *@param file + * The File with the data to load into the Trie. + *@exception IOException + * A problem occured while reading the data. + */ + public JaspellTernarySearchTrie(File file) throws IOException { + this(file, false); + } + + /** + * Constructs a Ternary Search Trie and loads data from a File + * into the Trie. The file is a normal text document, where each line is of + * the form "word TAB float". + * + *@param file + * The File with the data to load into the Trie. + *@param compression + * If true, the file is compressed with the GZIP algorithm, and if + * false, the file is a normal text document. 
+ *@exception IOException + * A problem occured while reading the data. + */ + public JaspellTernarySearchTrie(File file, boolean compression) + throws IOException { + this(); + BufferedReader in; + if (compression) + in = new BufferedReader(new InputStreamReader(new GZIPInputStream( + new FileInputStream(file)))); + else in = new BufferedReader(new InputStreamReader((new FileInputStream( + file)))); + String word; + int pos; + Float occur, one = new Float(1); + int numWords = 0; + while ((word = in.readLine()) != null) { + numWords++; + pos = word.indexOf("\t"); + occur = one; + if (pos != -1) { + occur = Float.parseFloat(word.substring(pos + 1).trim()); + word = word.substring(0, pos); + } + String key = word.toLowerCase(); + if (rootNode == null) { + rootNode = new TSTNode(key.charAt(0), null); + } + TSTNode node = null; + if (key.length() > 0 && rootNode != null) { + TSTNode currentNode = rootNode; + int charIndex = 0; + while (true) { + if (currentNode == null) break; + int charComp = compareCharsAlphabetically(key.charAt(charIndex), + currentNode.splitchar); + if (charComp == 0) { + charIndex++; + if (charIndex == key.length()) { + node = currentNode; + break; + } + currentNode = currentNode.relatives[TSTNode.EQKID]; + } else if (charComp < 0) { + currentNode = currentNode.relatives[TSTNode.LOKID]; + } else { + currentNode = currentNode.relatives[TSTNode.HIKID]; + } + } + Float occur2 = null; + if (node != null) occur2 = ((Float) (node.data)); + if (occur2 != null) { + occur += occur2.floatValue(); + } + currentNode = getOrCreateNode(word.trim().toLowerCase()); + currentNode.data = occur; + } + } + in.close(); + } + + /** + * Deletes the node passed in as an argument. If this node has non-null data, + * then both the node and the data will be deleted. It also deletes any other + * nodes in the trie that are no longer needed after the deletion of the node. + * + *@param nodeToDelete + * The node to delete. 
+ */ + private void deleteNode(TSTNode nodeToDelete) { + if (nodeToDelete == null) { + return; + } + nodeToDelete.data = null; + while (nodeToDelete != null) { + nodeToDelete = deleteNodeRecursion(nodeToDelete); + // deleteNodeRecursion(nodeToDelete); + } + } + + /** + * Recursively visits each node to be deleted. + * + * To delete a node, first set its data to null, then pass it into this + * method, then pass the node returned by this method into this method (make + * sure you don't delete the data of any of the nodes returned from this + * method!) and continue in this fashion until the node returned by this + * method is null. + * + * The TSTNode instance returned by this method will be next node to be + * operated on by deleteNodeRecursion (This emulates recursive + * method call while avoiding the JVM overhead normally associated with a + * recursive method.) + * + *@param currentNode + * The node to delete. + *@return The next node to be called in deleteNodeRecursion. + */ + private TSTNode deleteNodeRecursion(TSTNode currentNode) { + if (currentNode == null) { + return null; + } + if (currentNode.relatives[TSTNode.EQKID] != null + || currentNode.data != null) { + return null; + } + // can't delete this node if it has a non-null eq kid or data + TSTNode currentParent = currentNode.relatives[TSTNode.PARENT]; + boolean lokidNull = currentNode.relatives[TSTNode.LOKID] == null; + boolean hikidNull = currentNode.relatives[TSTNode.HIKID] == null; + int childType; + if (currentParent.relatives[TSTNode.LOKID] == currentNode) { + childType = TSTNode.LOKID; + } else if (currentParent.relatives[TSTNode.EQKID] == currentNode) { + childType = TSTNode.EQKID; + } else if (currentParent.relatives[TSTNode.HIKID] == currentNode) { + childType = TSTNode.HIKID; + } else { + rootNode = null; + return null; + } + if (lokidNull && hikidNull) { + currentParent.relatives[childType] = null; + return currentParent; + } + if (lokidNull) { + currentParent.relatives[childType] = 
currentNode.relatives[TSTNode.HIKID]; + currentNode.relatives[TSTNode.HIKID].relatives[TSTNode.PARENT] = currentParent; + return currentParent; + } + if (hikidNull) { + currentParent.relatives[childType] = currentNode.relatives[TSTNode.LOKID]; + currentNode.relatives[TSTNode.LOKID].relatives[TSTNode.PARENT] = currentParent; + return currentParent; + } + int deltaHi = currentNode.relatives[TSTNode.HIKID].splitchar + - currentNode.splitchar; + int deltaLo = currentNode.splitchar + - currentNode.relatives[TSTNode.LOKID].splitchar; + int movingKid; + TSTNode targetNode; + if (deltaHi == deltaLo) { + if (Math.random() < 0.5) { + deltaHi++; + } else { + deltaLo++; + } + } + if (deltaHi > deltaLo) { + movingKid = TSTNode.HIKID; + targetNode = currentNode.relatives[TSTNode.LOKID]; + } else { + movingKid = TSTNode.LOKID; + targetNode = currentNode.relatives[TSTNode.HIKID]; + } + while (targetNode.relatives[movingKid] != null) { + targetNode = targetNode.relatives[movingKid]; + } + targetNode.relatives[movingKid] = currentNode.relatives[movingKid]; + currentParent.relatives[childType] = targetNode; + targetNode.relatives[TSTNode.PARENT] = currentParent; + if (!lokidNull) { + currentNode.relatives[TSTNode.LOKID] = null; + } + if (!hikidNull) { + currentNode.relatives[TSTNode.HIKID] = null; + } + return currentParent; + } + + /** + * Retrieve the object indexed by a key. + * + *@param key + * A String index. + *@return The object retrieved from the Ternary Search Trie. + */ + public Object get(String key) { + TSTNode node = getNode(key.trim().toLowerCase()); + if (node == null) { + return null; + } + return node.data; + } + + /** + * Retrieve the Float indexed by key, increment it by one unit + * and store the new Float. + * + *@param key + * A String index. + *@return The Float retrieved from the Ternary Search Trie. 
+ */ + public Float getAndIncrement(String key) { + String key2 = key.trim().toLowerCase(); + TSTNode node = getNode(key2); + if (node == null) { + return null; + } + Float aux = (Float) (node.data); + if (aux == null) { + aux = new Float(1); + } else { + aux = new Float(aux.intValue() + 1); + } + put(key2, aux); + return aux; + } + + /** + * Returns the key that indexes the node argument. + * + *@param node + * The node whose index is to be calculated. + *@return The String that indexes the node argument. + */ + protected String getKey(TSTNode node) { + StringBuffer getKeyBuffer = new StringBuffer(); + getKeyBuffer.setLength(0); + getKeyBuffer.append("" + node.splitchar); + TSTNode currentNode; + TSTNode lastNode; + currentNode = node.relatives[TSTNode.PARENT]; + lastNode = node; + while (currentNode != null) { + if (currentNode.relatives[TSTNode.EQKID] == lastNode) { + getKeyBuffer.append("" + currentNode.splitchar); + } + lastNode = currentNode; + currentNode = currentNode.relatives[TSTNode.PARENT]; + } + getKeyBuffer.reverse(); + return getKeyBuffer.toString(); + } + + /** + * Returns the node indexed by key, or null if that node doesn't + * exist. Search begins at root node. + * + *@param key + * A String that indexes the node that is returned. + *@return The node object indexed by key. This object is an instance of an + * inner class named TernarySearchTrie.TSTNode. + */ + public TSTNode getNode(String key) { + return getNode(key, rootNode); + } + + /** + * Returns the node indexed by key, or null if that node doesn't + * exist. The search begins at root node. + * + *@param key2 + * A String that indexes the node that is returned. + *@param startNode + * The top node defining the subtrie to be searched. + *@return The node object indexed by key. This object is an instance of an + * inner class named TernarySearchTrie.TSTNode. 
+ */ + protected TSTNode getNode(String key2, TSTNode startNode) { + String key = key2.trim().toLowerCase(); + if (key == null || startNode == null || key.length() == 0) { + return null; + } + TSTNode currentNode = startNode; + int charIndex = 0; + while (true) { + if (currentNode == null) { + return null; + } + int charComp = compareCharsAlphabetically(key.charAt(charIndex), + currentNode.splitchar); + if (charComp == 0) { + charIndex++; + if (charIndex == key.length()) { + return currentNode; + } + currentNode = currentNode.relatives[TSTNode.EQKID]; + } else if (charComp < 0) { + currentNode = currentNode.relatives[TSTNode.LOKID]; + } else { + currentNode = currentNode.relatives[TSTNode.HIKID]; + } + } + } + + /** + * Returns the node indexed by key, creating that node if it doesn't exist, + * and creating any required intermediate nodes if they don't exist. + * + *@param key + * A String that indexes the node that is returned. + *@return The node object indexed by key. This object is an instance of an + * inner class named TernarySearchTrie.TSTNode. + *@exception NullPointerException + * If the key is null. + *@exception IllegalArgumentException + * If the key is an empty String. 
+ */ + protected TSTNode getOrCreateNode(String key) throws NullPointerException, + IllegalArgumentException { + if (key == null) { + throw new NullPointerException( + "attempt to get or create node with null key"); + } + if (key.length() == 0) { + throw new IllegalArgumentException( + "attempt to get or create node with key of zero length"); + } + if (rootNode == null) { + rootNode = new TSTNode(key.charAt(0), null); + } + TSTNode currentNode = rootNode; + int charIndex = 0; + while (true) { + int charComp = compareCharsAlphabetically(key.charAt(charIndex), + currentNode.splitchar); + if (charComp == 0) { + charIndex++; + if (charIndex == key.length()) { + return currentNode; + } + if (currentNode.relatives[TSTNode.EQKID] == null) { + currentNode.relatives[TSTNode.EQKID] = new TSTNode(key + .charAt(charIndex), currentNode); + } + currentNode = currentNode.relatives[TSTNode.EQKID]; + } else if (charComp < 0) { + if (currentNode.relatives[TSTNode.LOKID] == null) { + currentNode.relatives[TSTNode.LOKID] = new TSTNode(key + .charAt(charIndex), currentNode); + } + currentNode = currentNode.relatives[TSTNode.LOKID]; + } else { + if (currentNode.relatives[TSTNode.HIKID] == null) { + currentNode.relatives[TSTNode.HIKID] = new TSTNode(key + .charAt(charIndex), currentNode); + } + currentNode = currentNode.relatives[TSTNode.HIKID]; + } + } + } + + /** + * Returns a List of keys that almost match the argument key. + * Keys returned will have exactly diff characters that do not match the + * target key, where diff is equal to the last value passed in as an argument + * to the setMatchAlmostDiff method. + *

+ * If the matchAlmost method is called before the + * setMatchAlmostDiff method has been called for the first time, + * then diff = 0. + * + *@param key + * The target key. + *@return A List with the results. + */ + public List matchAlmost(String key) { + return matchAlmost(key, defaultNumReturnValues); + } + + /** + * Returns a List of keys that almost match the argument key. + * Keys returned will have exactly diff characters that do not match the + * target key, where diff is equal to the last value passed in as an argument + * to the setMatchAlmostDiff method. + *

+ * If the matchAlmost method is called before the + * setMatchAlmostDiff method has been called for the first time, + * then diff = 0. + * + *@param key + * The target key. + *@param numReturnValues + * The maximum number of values returned by this method. + *@return A List with the results + */ + public List matchAlmost(String key, int numReturnValues) { + return matchAlmostRecursion(rootNode, 0, matchAlmostDiff, key, + ((numReturnValues < 0) ? -1 : numReturnValues), new Vector(), false); + } + + /** + * Recursivelly vists the nodes in order to find the ones that almost match a + * given key. + * + *@param currentNode + * The current node. + *@param charIndex + * The current char. + *@param d + * The number of differences so far. + *@param matchAlmostNumReturnValues + * The maximum number of values in the result List. + *@param matchAlmostResult2 + * The results so far. + *@param upTo + * If true all keys having up to and including matchAlmostDiff + * mismatched letters will be included in the result (including a key + * that is exactly the same as the target string) otherwise keys will + * be included in the result only if they have exactly + * matchAlmostDiff number of mismatched letters. + *@param matchAlmostKey + * The key being searched. + *@return A List with the results. 
+ */ + private List matchAlmostRecursion(TSTNode currentNode, int charIndex, + int d, String matchAlmostKey, int matchAlmostNumReturnValues, + List matchAlmostResult2, boolean upTo) { + if ((currentNode == null) + || (matchAlmostNumReturnValues != -1 && matchAlmostResult2.size() >= matchAlmostNumReturnValues) + || (d < 0) || (charIndex >= matchAlmostKey.length())) { + return matchAlmostResult2; + } + int charComp = compareCharsAlphabetically(matchAlmostKey.charAt(charIndex), + currentNode.splitchar); + List matchAlmostResult = matchAlmostResult2; + if ((d > 0) || (charComp < 0)) { + matchAlmostResult = matchAlmostRecursion( + currentNode.relatives[TSTNode.LOKID], charIndex, d, + matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, + upTo); + } + int nextD = (charComp == 0) ? d : d - 1; + boolean cond = (upTo) ? (nextD >= 0) : (nextD == 0); + if ((matchAlmostKey.length() == charIndex + 1) && cond + && (currentNode.data != null)) { + matchAlmostResult.add(getKey(currentNode)); + } + matchAlmostResult = matchAlmostRecursion( + currentNode.relatives[TSTNode.EQKID], charIndex + 1, nextD, + matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo); + if ((d > 0) || (charComp > 0)) { + matchAlmostResult = matchAlmostRecursion( + currentNode.relatives[TSTNode.HIKID], charIndex, d, + matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, + upTo); + } + return matchAlmostResult; + } + + /** + * Returns an alphabetical List of all keys in the trie that + * begin with a given prefix. Only keys for nodes having non-null data are + * included in the List. + * + *@param prefix + * Each key returned from this method will begin with the characters + * in prefix. + *@return A List with the results. + */ + public List matchPrefix(String prefix) { + return matchPrefix(prefix, defaultNumReturnValues); + } + + /** + * Returns an alphabetical List of all keys in the trie that + * begin with a given prefix. 
Only keys for nodes having non-null data are + * included in the List. + * + *@param prefix + * Each key returned from this method will begin with the characters + * in prefix. + *@param numReturnValues + * The maximum number of values returned from this method. + *@return A List with the results + */ + public List matchPrefix(String prefix, int numReturnValues) { + Vector sortKeysResult = new Vector(); + TSTNode startNode = getNode(prefix); + if (startNode == null) { + return sortKeysResult; + } + if (startNode.data != null) { + sortKeysResult.addElement(getKey(startNode)); + } + return sortKeysRecursion(startNode.relatives[TSTNode.EQKID], + ((numReturnValues < 0) ? -1 : numReturnValues), sortKeysResult); + } + + /** + * Returns the number of nodes in the trie that have non-null data. + * + *@return The number of nodes in the trie that have non-null data. + */ + public int numDataNodes() { + return numDataNodes(rootNode); + } + + /** + * Returns the number of nodes in the subtrie below and including the starting + * node. The method counts only nodes that have non-null data. + * + *@param startingNode + * The top node of the subtrie. the node that defines the subtrie. + *@return The total number of nodes in the subtrie. + */ + protected int numDataNodes(TSTNode startingNode) { + return recursiveNodeCalculator(startingNode, true, 0); + } + + /** + * Returns the total number of nodes in the trie. The method counts nodes + * whether or not they have data. + * + *@return The total number of nodes in the trie. + */ + public int numNodes() { + return numNodes(rootNode); + } + + /** + * Returns the total number of nodes in the subtrie below and including the + * starting Node. The method counts nodes whether or not they have data. + * + *@param startingNode + * The top node of the subtrie. The node that defines the subtrie. + *@return The total number of nodes in the subtrie. 
+ */ + protected int numNodes(TSTNode startingNode) { + return recursiveNodeCalculator(startingNode, false, 0); + } + + /** + * Stores a value in the trie. The value may be retrieved using the key. + * + *@param key + * A String that indexes the object to be stored. + *@param value + * The object to be stored in the Trie. + */ + public void put(String key, Object value) { + getOrCreateNode(key.trim().toLowerCase()).data = value; + } + + /** + * Recursivelly visists each node to calculate the number of nodes. + * + *@param currentNode + * The current node. + *@param checkData + * If true we check the data to be different of null. + *@param numNodes2 + * The number of nodes so far. + *@return The number of nodes accounted. + */ + private int recursiveNodeCalculator(TSTNode currentNode, boolean checkData, + int numNodes2) { + if (currentNode == null) { + return numNodes2; + } + int numNodes = recursiveNodeCalculator( + currentNode.relatives[TSTNode.LOKID], checkData, numNodes2); + numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.EQKID], + checkData, numNodes); + numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.HIKID], + checkData, numNodes); + if (checkData) { + if (currentNode.data != null) { + numNodes++; + } + } else { + numNodes++; + } + return numNodes; + } + + /** + * Removes the value indexed by key. Also removes all nodes that are rendered + * unnecessary by the removal of this data. + * + *@param key + * A string that indexes the object to be removed from + * the Trie. + */ + public void remove(String key) { + deleteNode(getNode(key.trim().toLowerCase())); + } + + /** + * Sets the number of characters by which words can differ from target word + * when calling the matchAlmost method. + *

+ * Arguments less than 0 will set the char difference to 0, and arguments + * greater than 3 will set the char difference to 3. + * + *@param diff + * The number of characters by which words can differ from target + * word. + */ + public void setMatchAlmostDiff(int diff) { + if (diff < 0) { + matchAlmostDiff = 0; + } else if (diff > 3) { + matchAlmostDiff = 3; + } else { + matchAlmostDiff = diff; + } + } + + /** + * Sets the default maximum number of values returned from the + * matchPrefix and matchAlmost methods. + *

+ * The value should be set this to -1 to get an unlimited number of return + * values. note that the methods mentioned above provide overloaded versions + * that allow you to specify the maximum number of return values, in which + * case this value is temporarily overridden. + * + **@param num + * The number of values that will be returned when calling the + * methods above. + */ + public void setNumReturnValues(int num) { + defaultNumReturnValues = (num < 0) ? -1 : num; + } + + /** + * Returns keys sorted in alphabetical order. This includes the start Node and + * all nodes connected to the start Node. + *

+ * The number of keys returned is limited to numReturnValues. To get a list + * that isn't limited in size, set numReturnValues to -1. + * + *@param startNode + * The top node defining the subtrie to be searched. + *@param numReturnValues + * The maximum number of values returned from this method. + *@return A List with the results. + */ + protected List sortKeys(TSTNode startNode, int numReturnValues) { + return sortKeysRecursion(startNode, ((numReturnValues < 0) ? -1 + : numReturnValues), new Vector()); + } + + /** + * Returns keys sorted in alphabetical order. This includes the current Node + * and all nodes connected to the current Node. + *

+ * Sorted keys will be appended to the end of the resulting List. + * The result may be empty when this method is invoked, but may not be + * null. + * + *@param currentNode + * The current node. + *@param sortKeysNumReturnValues + * The maximum number of values in the result. + *@param sortKeysResult2 + * The results so far. + *@return A List with the results. + */ + private List sortKeysRecursion(TSTNode currentNode, + int sortKeysNumReturnValues, List sortKeysResult2) { + if (currentNode == null) { + return sortKeysResult2; + } + List sortKeysResult = sortKeysRecursion( + currentNode.relatives[TSTNode.LOKID], sortKeysNumReturnValues, + sortKeysResult2); + if (sortKeysNumReturnValues != -1 + && sortKeysResult.size() >= sortKeysNumReturnValues) { + return sortKeysResult; + } + if (currentNode.data != null) { + sortKeysResult.add(getKey(currentNode)); + } + sortKeysResult = sortKeysRecursion(currentNode.relatives[TSTNode.EQKID], + sortKeysNumReturnValues, sortKeysResult); + return sortKeysRecursion(currentNode.relatives[TSTNode.HIKID], + sortKeysNumReturnValues, sortKeysResult); + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java 2011-05-22 17:05:53.000000000 -0400 @@ -0,0 +1,142 @@ +package org.apache.lucene.search.suggest.tst; + +import java.util.*; + +public class TSTAutocomplete { + + /** + * Inserting keys in TST in the order middle,small,big (lexicographic measure) + * recursively creates a balanced tree which reduces insertion and search + * times significantly. + * + * @param tokens + * Sorted list of keys to be inserted in TST. 
+ * @param lo + * stores the lower index of current list. + * @param hi + * stores the higher index of current list. + * @param root + * a reference object to root of TST. + */ + public void balancedTree(Object[] tokens, Object[] vals, int lo, int hi, + TernaryTreeNode root) { + if (lo > hi) return; + int mid = (lo + hi) / 2; + root = insert(root, (String) tokens[mid], vals[mid], 0); + balancedTree(tokens, vals, lo, mid - 1, root); + balancedTree(tokens, vals, mid + 1, hi, root); + } + + /** + * Inserts a key in TST creating a series of Binary Search Trees at each node. + * The key is actually stored across the eqKid of each node in a successive + * manner. + * + * @param currentNode + * a reference node where the insertion will take currently. + * @param s + * key to be inserted in TST. + * @param x + * index of character in key to be inserted currently. + * @return currentNode The new reference to root node of TST + */ + public TernaryTreeNode insert(TernaryTreeNode currentNode, String s, + Object val, int x) { + if (s == null || s.length() <= x) { + return currentNode; + } + if (currentNode == null) { + TernaryTreeNode newNode = new TernaryTreeNode(); + newNode.splitchar = s.charAt(x); + currentNode = newNode; + if (x < s.length() - 1) { + currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1); + } else { + currentNode.token = s; + currentNode.val = val; + return currentNode; + } + } else if (currentNode.splitchar > s.charAt(x)) { + currentNode.loKid = insert(currentNode.loKid, s, val, x); + } else if (currentNode.splitchar == s.charAt(x)) { + if (x < s.length() - 1) { + currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1); + } else { + currentNode.token = s; + currentNode.val = val; + return currentNode; + } + } else { + currentNode.hiKid = insert(currentNode.hiKid, s, val, x); + } + return currentNode; + } + + /** + * Auto-completes a given prefix query using Depth-First Search with the end + * of prefix as source node each time finding a new 
leaf to get a complete key + * to be added in the suggest list. + * + * @param root + * a reference to root node of TST. + * @param s + * prefix query to be auto-completed. + * @param x + * index of current character to be searched while traversing through + * the prefix in TST. + * @return suggest list of auto-completed keys for the given prefix query. + */ + public ArrayList prefixCompletion(TernaryTreeNode root, + String s, int x) { + + TernaryTreeNode p = root; + ArrayList suggest = new ArrayList(); + + while (p != null) { + if (s.charAt(x) < p.splitchar) { + p = p.loKid; + } else if (s.charAt(x) == p.splitchar) { + if (x == s.length() - 1) { + break; + } else { + x++; + } + p = p.eqKid; + } else { + p = p.hiKid; + } + } + + if (p == null) return suggest; + if (p.eqKid == null && p.token == null) return suggest; + if (p.eqKid == null && p.token != null) { + suggest.add(p); + return suggest; + } + + if (p.token != null) { + suggest.add(p); + } + p = p.eqKid; + + Stack st = new Stack(); + st.push(p); + while (!st.empty()) { + TernaryTreeNode top = st.peek(); + st.pop(); + if (top.token != null) { + suggest.add(top); + } + if (top.eqKid != null) { + st.push(top.eqKid); + } + if (top.loKid != null) { + st.push(top.loKid); + } + if (top.hiKid != null) { + st.push(top.hiKid); + } + } + return suggest; + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java 2011-05-22 18:07:54.000000000 -0400 @@ -0,0 +1,174 @@ +package org.apache.lucene.search.suggest.tst; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; 
+import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper; +import org.apache.lucene.search.spell.SortedIterator; +import org.apache.lucene.search.spell.TermFreqIterator; + +public class TSTLookup extends Lookup { + TernaryTreeNode root = new TernaryTreeNode(); + TSTAutocomplete autocomplete = new TSTAutocomplete(); + + @Override + public void build(TermFreqIterator tfit) throws IOException { + root = new TernaryTreeNode(); + // buffer first + if (!(tfit instanceof SortedIterator)) { + // make sure it's sorted + tfit = new SortedTermFreqIteratorWrapper(tfit); + } + + ArrayList tokens = new ArrayList(); + ArrayList vals = new ArrayList(); + while (tfit.hasNext()) { + tokens.add(tfit.next()); + vals.add(new Float(tfit.freq())); + } + autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); + } + + @Override + public boolean add(String key, Object value) { + autocomplete.insert(root, key, value, 0); + // XXX we don't know if a new node was created + return true; + } + + @Override + public Object get(String key) { + List list = autocomplete.prefixCompletion(root, key, 0); + if (list == null || list.isEmpty()) { + return null; + } + for (TernaryTreeNode n : list) { + if (n.token.equals(key)) { + return n.val; + } + } + return null; + } + + @Override + public List lookup(String key, boolean onlyMorePopular, int num) { + List list = autocomplete.prefixCompletion(root, key, 0); + List res = new ArrayList(); + if (list == null || list.size() == 0) { + return res; + } + int maxCnt = Math.min(num, list.size()); + if (onlyMorePopular) { + LookupPriorityQueue queue = new LookupPriorityQueue(num); + for (TernaryTreeNode ttn : list) { + queue.insertWithOverflow(new LookupResult(ttn.token, (Float)ttn.val)); + } + for (LookupResult lr : queue.getResults()) { + res.add(lr); 
+ } + } else { + for (int i = 0; i < maxCnt; i++) { + TernaryTreeNode ttn = list.get(i); + res.add(new LookupResult(ttn.token, (Float)ttn.val)); + } + } + return res; + } + + public static final String FILENAME = "tst.dat"; + + private static final byte LO_KID = 0x01; + private static final byte EQ_KID = 0x02; + private static final byte HI_KID = 0x04; + private static final byte HAS_TOKEN = 0x08; + private static final byte HAS_VALUE = 0x10; + + @Override + public synchronized boolean load(File storeDir) throws IOException { + File data = new File(storeDir, FILENAME); + if (!data.exists() || !data.canRead()) { + return false; + } + DataInputStream in = new DataInputStream(new FileInputStream(data)); + root = new TernaryTreeNode(); + try { + readRecursively(in, root); + } finally { + in.close(); + } + return true; + } + + // pre-order traversal + private void readRecursively(DataInputStream in, TernaryTreeNode node) throws IOException { + node.splitchar = in.readChar(); + byte mask = in.readByte(); + if ((mask & HAS_TOKEN) != 0) { + node.token = in.readUTF(); + } + if ((mask & HAS_VALUE) != 0) { + node.val = new Float(in.readFloat()); + } + if ((mask & LO_KID) != 0) { + node.loKid = new TernaryTreeNode(); + readRecursively(in, node.loKid); + } + if ((mask & EQ_KID) != 0) { + node.eqKid = new TernaryTreeNode(); + readRecursively(in, node.eqKid); + } + if ((mask & HI_KID) != 0) { + node.hiKid = new TernaryTreeNode(); + readRecursively(in, node.hiKid); + } + } + + @Override + public synchronized boolean store(File storeDir) throws IOException { + if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { + return false; + } + File data = new File(storeDir, FILENAME); + DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); + try { + writeRecursively(out, root); + out.flush(); + } finally { + out.close(); + } + return true; + } + + // pre-order traversal + private void writeRecursively(DataOutputStream out, TernaryTreeNode node) 
throws IOException { + // write out the current node + out.writeChar(node.splitchar); + // prepare a mask of kids + byte mask = 0; + if (node.eqKid != null) mask |= EQ_KID; + if (node.loKid != null) mask |= LO_KID; + if (node.hiKid != null) mask |= HI_KID; + if (node.token != null) mask |= HAS_TOKEN; + if (node.val != null) mask |= HAS_VALUE; + out.writeByte(mask); + if (node.token != null) out.writeUTF(node.token); + if (node.val != null) out.writeFloat((Float)node.val); + // recurse and write kids + if (node.loKid != null) { + writeRecursively(out, node.loKid); + } + if (node.eqKid != null) { + writeRecursively(out, node.eqKid); + } + if (node.hiKid != null) { + writeRecursively(out, node.hiKid); + } + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TernaryTreeNode.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TernaryTreeNode.java --- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TernaryTreeNode.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TernaryTreeNode.java 2011-05-22 17:05:45.000000000 -0400 @@ -0,0 +1,25 @@ +package org.apache.lucene.search.suggest.tst; + +/** + * The class creates a TST node. + */ + +public class TernaryTreeNode { + /** the character stored by a node. */ + char splitchar; + /** a reference object to the node containing character smaller than this node's character. */ + TernaryTreeNode loKid; + /** + * a reference object to the node containing character next to this node's character as + * occurring in the inserted token. + */ + TernaryTreeNode eqKid; + /** a reference object to the node containing character higher than this node's character. */ + TernaryTreeNode hiKid; + /** + * used by leaf nodes to store the complete tokens to be added to suggest list while + * auto-completing the prefix. 
+ */ + String token; + Object val; +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,144 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; + +public class TestDirectSpellChecker extends LuceneTestCase { + + public void testSimpleExamples() throws Exception { + DirectSpellChecker spellChecker = new DirectSpellChecker(); + spellChecker.setMinQueryLength(0); + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, + new MockAnalyzer(random, MockTokenizer.SIMPLE, true)); + + for (int i = 0; i < 20; i++) { + Document doc = new Document(); + doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + + IndexReader ir = writer.getReader(); + + SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false); + assertTrue(similar.length > 0); + assertEquals("five", similar[0].string); + + similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir, false); + if (similar.length > 0) { + assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself + } + + similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false); + assertTrue(similar.length > 0); + assertEquals("five", similar[0].string); + + similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false); + assertTrue(similar.length > 0); + assertEquals("five", similar[0].string); + + similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false); + assertTrue(similar.length > 0); + assertEquals("five", similar[0].string); + + assertTrue(similar.length > 0); + similar = 
spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false); + assertEquals("five", similar[0].string); + + // add some more documents + for (int i = 1000; i < 1100; i++) { + Document doc = new Document(); + doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + + ir.close(); + ir = writer.getReader(); + + // look ma, no spellcheck index rebuild + similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false); + assertTrue(similar.length > 0); + assertEquals("thousand", similar[0].string); + + ir.close(); + writer.close(); + dir.close(); + } + + public void testOptions() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, + new MockAnalyzer(random, MockTokenizer.SIMPLE, true)); + + Document doc = new Document(); + doc.add(newField("text", "foobar", Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + doc.add(newField("text", "foobar", Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + doc.add(newField("text", "foobaz", Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + doc.add(newField("text", "fobar", Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + + IndexReader ir = writer.getReader(); + + DirectSpellChecker spellChecker = new DirectSpellChecker(); + spellChecker.setMaxQueryFrequency(0F); + SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 1, ir, true); + assertEquals(0, similar.length); + + spellChecker = new DirectSpellChecker(); // reset defaults + spellChecker.setMinQueryLength(5); + similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir, true); + assertEquals(0, similar.length); + + spellChecker = new DirectSpellChecker(); // reset defaults + spellChecker.setMaxEdits(1); + similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true); + assertEquals(0, 
similar.length); + + spellChecker = new DirectSpellChecker(); // reset defaults + spellChecker.setAccuracy(0.9F); + similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true); + assertEquals(0, similar.length); + + spellChecker = new DirectSpellChecker(); // reset defaults + spellChecker.setMinPrefix(0); + similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true); + assertEquals(1, similar.length); + + spellChecker = new DirectSpellChecker(); // reset defaults + spellChecker.setMinPrefix(1); + similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true); + assertEquals(0, similar.length); + + ir.close(); + writer.close(); + dir.close(); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,49 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.LuceneTestCase; + +public class TestJaroWinklerDistance extends LuceneTestCase { + + private StringDistance sd = new JaroWinklerDistance(); + + public void testGetDistance() { + float d = sd.getDistance("al", "al"); + assertTrue(d == 1.0f); + d = sd.getDistance("martha", "marhta"); + assertTrue(d > 0.961 && d <0.962); + d = sd.getDistance("jones", "johnson"); + assertTrue(d > 0.832 && d < 0.833); + d = sd.getDistance("abcvwxyz", "cabvwxyz"); + assertTrue(d > 0.958 && d < 0.959); + d = sd.getDistance("dwayne", "duane"); + assertTrue(d > 0.84 && d < 0.841); + d = sd.getDistance("dixon", "dicksonx"); + assertTrue(d > 0.813 && d < 0.814); + d = sd.getDistance("fvie", "ten"); + assertTrue(d == 0f); + float d1 = sd.getDistance("zac ephron", "zac efron"); + float d2 = sd.getDistance("zac ephron", "kai ephron"); + assertTrue(d1 > d2); + d1 = sd.getDistance("brittney spears", "britney spears"); + d2 = sd.getDistance("brittney spears", "brittney startzman"); + assertTrue(d1 > d2); + } + +} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,54 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.LuceneTestCase; + +public class TestLevenshteinDistance extends LuceneTestCase { + + private StringDistance sd = new LevensteinDistance(); + + public void testGetDistance() { + float d = sd.getDistance("al", "al"); + assertEquals(d,1.0f,0.001); + d = sd.getDistance("martha", "marhta"); + assertEquals(d,0.6666,0.001); + d = sd.getDistance("jones", "johnson"); + assertEquals(d,0.4285,0.001); + d = sd.getDistance("abcvwxyz", "cabvwxyz"); + assertEquals(d,0.75,0.001); + d = sd.getDistance("dwayne", "duane"); + assertEquals(d,0.666,0.001); + d = sd.getDistance("dixon", "dicksonx"); + assertEquals(d,0.5,0.001); + d = sd.getDistance("six", "ten"); + assertEquals(d,0,0.001); + float d1 = sd.getDistance("zac ephron", "zac efron"); + float d2 = sd.getDistance("zac ephron", "kai ephron"); + assertEquals(d1,d2,0.001); + d1 = sd.getDistance("brittney spears", "britney spears"); + d2 = sd.getDistance("brittney spears", "brittney startzman"); + assertTrue(d1 > d2); + } + + public void testEmpty() throws Exception { + float d = sd.getDistance("", "al"); + assertEquals(d,0.0f,0.001); + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java --- 
lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,210 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Test case for LuceneDictionary. + * It first creates a simple index and then a couple of instances of LuceneDictionary + * on different fields and checks if all the right text comes back. 
+ */ +public class TestLuceneDictionary extends LuceneTestCase { + + private Directory store; + + private IndexReader indexReader = null; + private LuceneDictionary ld; + private Iterator it; + + @Override + public void setUp() throws Exception { + super.setUp(); + store = newDirectory(); + IndexWriter writer = new IndexWriter(store, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false))); + + Document doc; + + doc = new Document(); + doc.add(newField("aaa", "foo", Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(newField("aaa", "foo", Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new Field("contents", "Tom", Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new Field("contents", "Jerry", Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(newField("zzz", "bar", Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + + writer.optimize(); + writer.close(); + } + + @Override + public void tearDown() throws Exception { + if (indexReader != null) + indexReader.close(); + store.close(); + super.tearDown(); + } + + public void testFieldNonExistent() throws IOException { + try { + indexReader = IndexReader.open(store, true); + + ld = new LuceneDictionary(indexReader, "nonexistent_field"); + it = ld.getWordsIterator(); + + assertFalse("More elements than expected", it.hasNext()); + assertTrue("Nonexistent element is really null", it.next() == null); + } finally { + if (indexReader != null) { indexReader.close(); } + } + } + + public void testFieldAaa() throws IOException { + try { + indexReader = IndexReader.open(store, true); + + ld = new LuceneDictionary(indexReader, "aaa"); + it = ld.getWordsIterator(); + + assertTrue("First element doesn't exist.", it.hasNext()); + assertTrue("First element 
isn't correct", it.next().equals("foo")); + assertFalse("More elements than expected", it.hasNext()); + assertTrue("Nonexistent element is really null", it.next() == null); + } finally { + if (indexReader != null) { indexReader.close(); } + } + } + + public void testFieldContents_1() throws IOException { + try { + indexReader = IndexReader.open(store, true); + + ld = new LuceneDictionary(indexReader, "contents"); + it = ld.getWordsIterator(); + + assertTrue("First element doesn't exist.", it.hasNext()); + assertTrue("First element isn't correct", it.next().equals("Jerry")); + assertTrue("Second element doesn't exist.", it.hasNext()); + assertTrue("Second element isn't correct", it.next().equals("Tom")); + assertFalse("More elements than expected", it.hasNext()); + assertTrue("Nonexistent element is really null", it.next() == null); + + ld = new LuceneDictionary(indexReader, "contents"); + it = ld.getWordsIterator(); + + int counter = 2; + while (it.hasNext()) { + it.next(); + counter--; + } + + assertTrue("Number of words incorrect", counter == 0); + } + finally { + if (indexReader != null) { indexReader.close(); } + } + } + + public void testFieldContents_2() throws IOException { + try { + indexReader = IndexReader.open(store, true); + + ld = new LuceneDictionary(indexReader, "contents"); + it = ld.getWordsIterator(); + + // hasNext() should have no side effects + assertTrue("First element isn't were it should be.", it.hasNext()); + assertTrue("First element isn't were it should be.", it.hasNext()); + assertTrue("First element isn't were it should be.", it.hasNext()); + + // just iterate through words + assertTrue("First element isn't correct", it.next().equals("Jerry")); + assertTrue("Second element isn't correct", it.next().equals("Tom")); + assertTrue("Nonexistent element is really null", it.next() == null); + + // hasNext() should still have no side effects ... 
+ assertFalse("There should be any more elements", it.hasNext()); + assertFalse("There should be any more elements", it.hasNext()); + assertFalse("There should be any more elements", it.hasNext()); + + // .. and there are really no more words + assertTrue("Nonexistent element is really null", it.next() == null); + assertTrue("Nonexistent element is really null", it.next() == null); + assertTrue("Nonexistent element is really null", it.next() == null); + } + finally { + if (indexReader != null) { indexReader.close(); } + } + } + + public void testFieldZzz() throws IOException { + try { + indexReader = IndexReader.open(store, true); + + ld = new LuceneDictionary(indexReader, "zzz"); + it = ld.getWordsIterator(); + + assertTrue("First element doesn't exist.", it.hasNext()); + assertTrue("First element isn't correct", it.next().equals("bar")); + assertFalse("More elements than expected", it.hasNext()); + assertTrue("Nonexistent element is really null", it.next() == null); + } + finally { + if (indexReader != null) { indexReader.close(); } + } + } + + public void testSpellchecker() throws IOException { + Directory dir = newDirectory(); + SpellChecker sc = new SpellChecker(dir); + indexReader = IndexReader.open(store, true); + sc.indexDictionary(new LuceneDictionary(indexReader, "contents")); + String[] suggestions = sc.suggestSimilar("Tam", 1); + assertEquals(1, suggestions.length); + assertEquals("Tom", suggestions[0]); + suggestions = sc.suggestSimilar("Jarry", 1); + assertEquals(1, suggestions.length); + assertEquals("Jerry", suggestions[0]); + indexReader.close(); + sc.close(); + dir.close(); + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestNGramDistance.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestNGramDistance.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestNGramDistance.java 1969-12-31 19:00:00.000000000 -0500 +++ 
lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestNGramDistance.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,132 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.LuceneTestCase; + +public class TestNGramDistance extends LuceneTestCase { + + + + public void testGetDistance1() { + StringDistance nsd = new NGramDistance(1); + float d = nsd.getDistance("al", "al"); + assertEquals(d,1.0f,0.001); + d = nsd.getDistance("a", "a"); + assertEquals(d,1.0f,0.001); + d = nsd.getDistance("b", "a"); + assertEquals(d,0.0f,0.001); + d = nsd.getDistance("martha", "marhta"); + assertEquals(d,0.6666,0.001); + d = nsd.getDistance("jones", "johnson"); + assertEquals(d,0.4285,0.001); + d = nsd.getDistance("natural", "contrary"); + assertEquals(d,0.25,0.001); + d = nsd.getDistance("abcvwxyz", "cabvwxyz"); + assertEquals(d,0.75,0.001); + d = nsd.getDistance("dwayne", "duane"); + assertEquals(d,0.666,0.001); + d = nsd.getDistance("dixon", "dicksonx"); + assertEquals(d,0.5,0.001); + d = nsd.getDistance("six", "ten"); + assertEquals(d,0,0.001); + float d1 = nsd.getDistance("zac ephron", "zac efron"); + float d2 = nsd.getDistance("zac 
ephron", "kai ephron"); + assertEquals(d1,d2,0.001); + d1 = nsd.getDistance("brittney spears", "britney spears"); + d2 = nsd.getDistance("brittney spears", "brittney startzman"); + assertTrue(d1 > d2); + d1 = nsd.getDistance("12345678", "12890678"); + d2 = nsd.getDistance("12345678", "72385698"); + assertEquals(d1,d2,001); + } + + public void testGetDistance2() { + StringDistance sd = new NGramDistance(2); + float d = sd.getDistance("al", "al"); + assertEquals(d,1.0f,0.001); + d = sd.getDistance("a", "a"); + assertEquals(d,1.0f,0.001); + d = sd.getDistance("b", "a"); + assertEquals(d,0.0f,0.001); + d = sd.getDistance("a", "aa"); + assertEquals(d,0.5f,0.001); + d = sd.getDistance("martha", "marhta"); + assertEquals(d,0.6666,0.001); + d = sd.getDistance("jones", "johnson"); + assertEquals(d,0.4285,0.001); + d = sd.getDistance("natural", "contrary"); + assertEquals(d,0.25,0.001); + d = sd.getDistance("abcvwxyz", "cabvwxyz"); + assertEquals(d,0.625,0.001); + d = sd.getDistance("dwayne", "duane"); + assertEquals(d,0.5833,0.001); + d = sd.getDistance("dixon", "dicksonx"); + assertEquals(d,0.5,0.001); + d = sd.getDistance("six", "ten"); + assertEquals(d,0,0.001); + float d1 = sd.getDistance("zac ephron", "zac efron"); + float d2 = sd.getDistance("zac ephron", "kai ephron"); + assertTrue(d1 > d2); + d1 = sd.getDistance("brittney spears", "britney spears"); + d2 = sd.getDistance("brittney spears", "brittney startzman"); + assertTrue(d1 > d2); + d1 = sd.getDistance("0012345678", "0012890678"); + d2 = sd.getDistance("0012345678", "0072385698"); + assertEquals(d1,d2,0.001); + } + + public void testGetDistance3() { + StringDistance sd = new NGramDistance(3); + float d = sd.getDistance("al", "al"); + assertEquals(d,1.0f,0.001); + d = sd.getDistance("a", "a"); + assertEquals(d,1.0f,0.001); + d = sd.getDistance("b", "a"); + assertEquals(d,0.0f,0.001); + d = sd.getDistance("martha", "marhta"); + assertEquals(d,0.7222,0.001); + d = sd.getDistance("jones", "johnson"); + 
assertEquals(d,0.4762,0.001); + d = sd.getDistance("natural", "contrary"); + assertEquals(d,0.2083,0.001); + d = sd.getDistance("abcvwxyz", "cabvwxyz"); + assertEquals(d,0.5625,0.001); + d = sd.getDistance("dwayne", "duane"); + assertEquals(d,0.5277,0.001); + d = sd.getDistance("dixon", "dicksonx"); + assertEquals(d,0.4583,0.001); + d = sd.getDistance("six", "ten"); + assertEquals(d,0,0.001); + float d1 = sd.getDistance("zac ephron", "zac efron"); + float d2 = sd.getDistance("zac ephron", "kai ephron"); + assertTrue(d1 > d2); + d1 = sd.getDistance("brittney spears", "britney spears"); + d2 = sd.getDistance("brittney spears", "brittney startzman"); + assertTrue(d1 > d2); + d1 = sd.getDistance("0012345678", "0012890678"); + d2 = sd.getDistance("0012345678", "0072385698"); + assertTrue(d1 < d2); + } + + public void testEmpty() throws Exception { + StringDistance nsd = new NGramDistance(1); + float d = nsd.getDistance("", "al"); + assertEquals(d,0.0f,0.001); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,47 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Test case for PlainTextDictionary + * + */ +public class TestPlainTextDictionary extends LuceneTestCase { + + public void testBuild() throws IOException { + final String LF = System.getProperty("line.separator"); + String input = "oneword" + LF + "twoword" + LF + "threeword"; + PlainTextDictionary ptd = new PlainTextDictionary(new StringReader(input)); + Directory ramDir = newDirectory(); + SpellChecker spellChecker = new SpellChecker(ramDir); + spellChecker.indexDictionary(ptd); + String[] similar = spellChecker.suggestSimilar("treeword", 2); + assertEquals(2, similar.length); + assertEquals(similar[0], "threeword"); + assertEquals(similar[1], "oneword"); + spellChecker.close(); + ramDir.close(); + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java 2011-05-22 16:44:12.000000000 -0400 @@ -0,0 +1,438 @@ +package org.apache.lucene.search.spell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Spell checker test case + */ +public class TestSpellChecker extends LuceneTestCase { + private SpellCheckerMock spellChecker; + private Directory userindex, spellindex; + private List searchers; + + @Override + public void setUp() throws Exception { + super.setUp(); + + //create a user index + userindex = newDirectory(); + IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig( + TEST_VERSION_CURRENT, new MockAnalyzer(random))); + 
+ for (int i = 0; i < 1000; i++) { + Document doc = new Document(); + doc.add(newField("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED)); + doc.add(newField("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand + doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand + writer.addDocument(doc); + } + writer.close(); + searchers = Collections.synchronizedList(new ArrayList()); + // create the spellChecker + spellindex = newDirectory(); + spellChecker = new SpellCheckerMock(spellindex); + } + + @Override + public void tearDown() throws Exception { + userindex.close(); + if (!spellChecker.isClosed()) + spellChecker.close(); + spellindex.close(); + super.tearDown(); + } + + + public void testBuild() throws CorruptIndexException, IOException { + IndexReader r = IndexReader.open(userindex, true); + + spellChecker.clearIndex(); + + addwords(r, spellChecker, "field1"); + int num_field1 = this.numdoc(); + + addwords(r, spellChecker, "field2"); + int num_field2 = this.numdoc(); + + assertEquals(num_field2, num_field1 + 1); + + assertLastSearcherOpen(4); + + checkCommonSuggestions(r); + checkLevenshteinSuggestions(r); + + spellChecker.setStringDistance(new JaroWinklerDistance()); + spellChecker.setAccuracy(0.8f); + checkCommonSuggestions(r); + checkJaroWinklerSuggestions(); + // the accuracy is set to 0.8 by default, but the best result has a score of 0.925 + String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f); + assertTrue(similar.length == 0); + similar = spellChecker.suggestSimilar("fvie", 2, 0.92f); + assertTrue(similar.length == 1); + + similar = spellChecker.suggestSimilar("fiv", 2); + assertTrue(similar.length > 0); + assertEquals(similar[0], "five"); + + spellChecker.setStringDistance(new NGramDistance(2)); + spellChecker.setAccuracy(0.5f); + checkCommonSuggestions(r); + checkNGramSuggestions(); + + r.close(); + } + + 
public void testComparator() throws Exception { + IndexReader r = IndexReader.open(userindex, true); + Directory compIdx = newDirectory(); + SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator()); + addwords(r, compareSP, "field3"); + + String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false); + assertTrue(similar.length == 2); + //five and fvei have the same score, but different frequencies. + assertEquals("fvei", similar[0]); + assertEquals("five", similar[1]); + r.close(); + if (!compareSP.isClosed()) + compareSP.close(); + compIdx.close(); + } + + private void checkCommonSuggestions(IndexReader r) throws IOException { + String[] similar = spellChecker.suggestSimilar("fvie", 2); + assertTrue(similar.length > 0); + assertEquals(similar[0], "five"); + + similar = spellChecker.suggestSimilar("five", 2); + if (similar.length > 0) { + assertFalse(similar[0].equals("five")); // don't suggest a word for itself + } + + similar = spellChecker.suggestSimilar("fiv", 2); + assertTrue(similar.length > 0); + assertEquals(similar[0], "five"); + + similar = spellChecker.suggestSimilar("fives", 2); + assertTrue(similar.length > 0); + assertEquals(similar[0], "five"); + + assertTrue(similar.length > 0); + similar = spellChecker.suggestSimilar("fie", 2); + assertEquals(similar[0], "five"); + + // test restraint to a field + similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false); + assertEquals(0, similar.length); // there isn't the term thousand in the field field1 + + similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false); + assertEquals(1, similar.length); // there is the term thousand in the field field2 + } + + private void checkLevenshteinSuggestions(IndexReader r) throws IOException { + // test small word + String[] similar = spellChecker.suggestSimilar("fvie", 2); + assertEquals(1, similar.length); + assertEquals(similar[0], "five"); + + similar = 
spellChecker.suggestSimilar("five", 2); + assertEquals(1, similar.length); + assertEquals(similar[0], "nine"); // don't suggest a word for itself + + similar = spellChecker.suggestSimilar("fiv", 2); + assertEquals(1, similar.length); + assertEquals(similar[0], "five"); + + similar = spellChecker.suggestSimilar("ive", 2); + assertEquals(2, similar.length); + assertEquals(similar[0], "five"); + assertEquals(similar[1], "nine"); + + similar = spellChecker.suggestSimilar("fives", 2); + assertEquals(1, similar.length); + assertEquals(similar[0], "five"); + + similar = spellChecker.suggestSimilar("fie", 2); + assertEquals(2, similar.length); + assertEquals(similar[0], "five"); + assertEquals(similar[1], "nine"); + + similar = spellChecker.suggestSimilar("fi", 2); + assertEquals(1, similar.length); + assertEquals(similar[0], "five"); + + // test restraint to a field + similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false); + assertEquals(0, similar.length); // there isn't the term thousand in the field field1 + + similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false); + assertEquals(1, similar.length); // there is the term thousand in the field field2 + + similar = spellChecker.suggestSimilar("onety", 2); + assertEquals(2, similar.length); + assertEquals(similar[0], "ninety"); + assertEquals(similar[1], "one"); + try { + similar = spellChecker.suggestSimilar("tousand", 10, r, null, false); + } catch (NullPointerException e) { + assertTrue("threw an NPE, and it shouldn't have", false); + } + } + + private void checkJaroWinklerSuggestions() throws IOException { + String[] similar = spellChecker.suggestSimilar("onety", 2); + assertEquals(2, similar.length); + assertEquals(similar[0], "one"); + assertEquals(similar[1], "ninety"); + } + + private void checkNGramSuggestions() throws IOException { + String[] similar = spellChecker.suggestSimilar("onety", 2); + assertEquals(2, similar.length); + assertEquals(similar[0], "one"); + 
assertEquals(similar[1], "ninety"); + } + + private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException { + long time = System.currentTimeMillis(); + sc.indexDictionary(new LuceneDictionary(r, field)); + time = System.currentTimeMillis() - time; + //System.out.println("time to build " + field + ": " + time); + } + + private int numdoc() throws IOException { + IndexReader rs = IndexReader.open(spellindex, true); + int num = rs.numDocs(); + assertTrue(num != 0); + //System.out.println("num docs: " + num); + rs.close(); + return num; + } + + public void testClose() throws IOException { + IndexReader r = IndexReader.open(userindex, true); + spellChecker.clearIndex(); + String field = "field1"; + addwords(r, spellChecker, "field1"); + int num_field1 = this.numdoc(); + addwords(r, spellChecker, "field2"); + int num_field2 = this.numdoc(); + assertEquals(num_field2, num_field1 + 1); + checkCommonSuggestions(r); + assertLastSearcherOpen(4); + spellChecker.close(); + assertSearchersClosed(); + try { + spellChecker.close(); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + try { + checkCommonSuggestions(r); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + + try { + spellChecker.clearIndex(); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + + try { + spellChecker.indexDictionary(new LuceneDictionary(r, field)); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + + try { + spellChecker.setSpellIndex(spellindex); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + assertEquals(4, searchers.size()); + assertSearchersClosed(); + r.close(); + } + + /* + * tests if the internally shared indexsearcher is correctly closed + * when the spellchecker is concurrently accessed and closed. 
+ */ + public void testConcurrentAccess() throws IOException, InterruptedException { + assertEquals(1, searchers.size()); + final IndexReader r = IndexReader.open(userindex, true); + spellChecker.clearIndex(); + assertEquals(2, searchers.size()); + addwords(r, spellChecker, "field1"); + assertEquals(3, searchers.size()); + int num_field1 = this.numdoc(); + addwords(r, spellChecker, "field2"); + assertEquals(4, searchers.size()); + int num_field2 = this.numdoc(); + assertEquals(num_field2, num_field1 + 1); + int numThreads = 5 + this.random.nextInt(5); + ExecutorService executor = Executors.newFixedThreadPool(numThreads); + SpellCheckWorker[] workers = new SpellCheckWorker[numThreads]; + for (int i = 0; i < numThreads; i++) { + SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r); + executor.execute(spellCheckWorker); + workers[i] = spellCheckWorker; + + } + int iterations = 5 + random.nextInt(5); + for (int i = 0; i < iterations; i++) { + Thread.sleep(100); + // concurrently reset the spell index + spellChecker.setSpellIndex(this.spellindex); + // for debug - prints the internal open searchers + // showSearchersOpen(); + } + + spellChecker.close(); + executor.shutdown(); + // wait for 60 seconds - usually this is very fast but coverage runs could take quite long + executor.awaitTermination(60L, TimeUnit.SECONDS); + + for (int i = 0; i < workers.length; i++) { + assertFalse(String.format("worker thread %d failed", i), workers[i].failed); + assertTrue(String.format("worker thread %d is still running but should be terminated", i), workers[i].terminated); + } + // 4 searchers more than iterations + // 1. at creation + // 2. clearIndex() + // 2. and 3. 
during addwords + assertEquals(iterations + 4, searchers.size()); + assertSearchersClosed(); + r.close(); + } + + private void assertLastSearcherOpen(int numSearchers) { + assertEquals(numSearchers, searchers.size()); + IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]); + for (int i = 0; i < searcherArray.length; i++) { + if (i == searcherArray.length - 1) { + assertTrue("expected last searcher open but was closed", + searcherArray[i].getIndexReader().getRefCount() > 0); + } else { + assertFalse("expected closed searcher but was open - Index: " + i, + searcherArray[i].getIndexReader().getRefCount() > 0); + } + } + } + + private void assertSearchersClosed() { + for (IndexSearcher searcher : searchers) { + assertEquals(0, searcher.getIndexReader().getRefCount()); + } + } + + // For debug +// private void showSearchersOpen() { +// int count = 0; +// for (IndexSearcher searcher : searchers) { +// if(searcher.getIndexReader().getRefCount() > 0) +// ++count; +// } +// System.out.println(count); +// } + + + private class SpellCheckWorker implements Runnable { + private final IndexReader reader; + volatile boolean terminated = false; + volatile boolean failed = false; + + SpellCheckWorker(IndexReader reader) { + super(); + this.reader = reader; + } + + public void run() { + try { + while (true) { + try { + checkCommonSuggestions(reader); + } catch (AlreadyClosedException e) { + + return; + } catch (Throwable e) { + + e.printStackTrace(); + failed = true; + return; + } + } + } finally { + terminated = true; + } + } + + } + + class SpellCheckerMock extends SpellChecker { + public SpellCheckerMock(Directory spellIndex) throws IOException { + super(spellIndex); + } + + public SpellCheckerMock(Directory spellIndex, StringDistance sd) + throws IOException { + super(spellIndex, sd); + } + + public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException { + super(spellIndex, sd, comparator); + } + + @Override + 
IndexSearcher createSearcher(Directory dir) throws IOException { + IndexSearcher searcher = super.createSearcher(dir); + TestSpellChecker.this.searchers.add(searcher); + return searcher; + } + } + +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/Average.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/Average.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/Average.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/Average.java 2011-05-22 18:51:32.000000000 -0400 @@ -0,0 +1,70 @@ +package org.apache.lucene.search.suggest; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.List; +import java.util.Locale; + +/** + * Average with standard deviation. + */ +final class Average +{ + /** + * Average (in milliseconds). + */ + public final double avg; + + /** + * Standard deviation (in milliseconds). 
+ */ + public final double stddev; + + /** + * + */ + Average(double avg, double stddev) + { + this.avg = avg; + this.stddev = stddev; + } + + public String toString() + { + return String.format(Locale.ENGLISH, "%.0f [+- %.2f]", + avg, stddev); + } + + static Average from(List values) + { + double sum = 0; + double sumSquares = 0; + + for (double l : values) + { + sum += l; + sumSquares += l * l; + } + + double avg = sum / (double) values.size(); + return new Average( + (sum / (double) values.size()), + Math.sqrt(sumSquares / (double) values.size() - avg * avg)); + } +} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java 2011-05-22 19:05:43.000000000 -0400 @@ -0,0 +1,246 @@ +package org.apache.lucene.search.suggest; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.Random; +import java.util.concurrent.Callable; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.fst.FSTLookup; +import org.apache.lucene.search.suggest.jaspell.JaspellLookup; +import org.apache.lucene.search.suggest.tst.TSTLookup; + +import org.junit.BeforeClass; +import org.junit.Ignore; + +/** + * Benchmarks tests for implementations of {@link Lookup} interface. + */ +@Ignore("COMMENT ME TO RUN BENCHMARKS!") +public class LookupBenchmarkTest extends LuceneTestCase { + @SuppressWarnings("unchecked") + private final List> benchmarkClasses = Arrays.asList( + JaspellLookup.class, + TSTLookup.class, + FSTLookup.class); + + private final static int rounds = 15; + private final static int warmup = 5; + + private final int num = 7; + private final boolean onlyMorePopular = true; + + private final static Random random = new Random(0xdeadbeef); + + /** + * Input term/weight pairs. + */ + private static TermFreq [] dictionaryInput; + + /** + * Benchmark term/weight pairs (randomized order). + */ + private static List benchmarkInput; + + /** + * Loads terms and frequencies from Wikipedia (cached). + */ + @BeforeClass + public static void setup() throws Exception { + List input = readTop50KWiki(); + Collections.shuffle(input, random); + LookupBenchmarkTest.dictionaryInput = input.toArray(new TermFreq [input.size()]); + Collections.shuffle(input, random); + LookupBenchmarkTest.benchmarkInput = input; + } + + static final Charset UTF_8 = Charset.forName("UTF-8"); + + /** + * Collect the multilingual input for benchmarks/ tests. 
+ */ + public static List readTop50KWiki() throws Exception { + List input = new ArrayList(); + URL resource = LookupBenchmarkTest.class.getResource("Top50KWiki.utf8"); + assert resource != null : "Resource missing: Top50KWiki.utf8"; + + String line = null; + BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(), UTF_8)); + while ((line = br.readLine()) != null) { + int tab = line.indexOf('|'); + assertTrue("No | separator?: " + line, tab >= 0); + float weight = Float.parseFloat(line.substring(tab + 1)); + String key = line.substring(0, tab); + input.add(new TermFreq(key, weight)); + } + br.close(); + return input; + } + + /** + * Test construction time. + */ + public void testConstructionTime() throws Exception { + System.err.println("-- construction time"); + for (final Class cls : benchmarkClasses) { + BenchmarkResult result = measure(new Callable() { + public Integer call() throws Exception { + final Lookup lookup = buildLookup(cls, dictionaryInput); + return lookup.hashCode(); + } + }); + + System.err.println( + String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s", + cls.getSimpleName(), + dictionaryInput.length, + result.average.toString())); + } + } + + /** + * Test memory required for the storage. + */ + public void testStorageNeeds() throws Exception { + System.err.println("-- RAM consumption"); + final RamUsageEstimator rue = new RamUsageEstimator(); + for (Class cls : benchmarkClasses) { + Lookup lookup = buildLookup(cls, dictionaryInput); + System.err.println( + String.format(Locale.ENGLISH, "%-15s size[B]:%,13d", + lookup.getClass().getSimpleName(), + rue.estimateRamUsage(lookup))); + } + } + + /** + * Create {@link Lookup} instance and populate it. + */ + private Lookup buildLookup(Class cls, TermFreq[] input) throws Exception { + Lookup lookup = cls.newInstance(); + lookup.build(new TermFreqArrayIterator(input)); + return lookup; + } + + /** + * Test performance of lookup on full hits. 
+ */ + public void testPerformanceOnFullHits() throws Exception { + final int minPrefixLen = 100; + final int maxPrefixLen = 200; + runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular); + } + + /** + * Test performance of lookup on longer term prefixes (6-9 letters or shorter). + */ + public void testPerformanceOnPrefixes6_9() throws Exception { + final int minPrefixLen = 6; + final int maxPrefixLen = 9; + runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular); + } + + /** + * Test performance of lookup on short term prefixes (2-4 letters or shorter). + */ + public void testPerformanceOnPrefixes2_4() throws Exception { + final int minPrefixLen = 2; + final int maxPrefixLen = 4; + runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular); + } + + /** + * Run the actual benchmark. + */ + public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen, + final int num, final boolean onlyMorePopular) throws Exception { + System.err.println(String.format(Locale.ENGLISH, + "-- prefixes: %d-%d, num: %d, onlyMorePopular: %s", + minPrefixLen, maxPrefixLen, num, onlyMorePopular)); + + for (Class cls : benchmarkClasses) { + final Lookup lookup = buildLookup(cls, dictionaryInput); + + final List input = new ArrayList(benchmarkInput.size()); + for (TermFreq tf : benchmarkInput) { + input.add(tf.term.substring(0, Math.min(tf.term.length(), + minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)))); + } + + BenchmarkResult result = measure(new Callable() { + public Integer call() throws Exception { + int v = 0; + for (String term : input) { + v += lookup.lookup(term, onlyMorePopular, num).size(); + } + return v; + } + }); + + System.err.println( + String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f", + lookup.getClass().getSimpleName(), + input.size(), + result.average.toString(), + input.size() / result.average.avg)); + } + } + + /** + * Do the measurements. 
+ */ + private BenchmarkResult measure(Callable callable) { + final double NANOS_PER_MS = 1000000; + + try { + List times = new ArrayList(); + for (int i = 0; i < warmup + rounds; i++) { + final long start = System.nanoTime(); + guard = callable.call().intValue(); + times.add((System.nanoTime() - start) / NANOS_PER_MS); + } + return new BenchmarkResult(times, warmup, rounds); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** Guard against opts. */ + @SuppressWarnings("unused") + private static volatile int guard; + + private static class BenchmarkResult { + /** Average time per round (ms). */ + public final Average average; + + public BenchmarkResult(List times, int warmup, int rounds) { + this.average = Average.from(times.subList(warmup, times.size())); + } + } +} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java 2011-05-22 18:50:16.000000000 -0400 @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.suggest; + +import java.io.File; + +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.fst.FSTLookup; +import org.apache.lucene.search.suggest.jaspell.JaspellLookup; +import org.apache.lucene.search.suggest.tst.TSTLookup; +import org.apache.lucene.util.LuceneTestCase; + +public class PersistenceTest extends LuceneTestCase { + public final String[] keys = new String[] { + "one", + "two", + "three", + "four", + "oneness", + "onerous", + "onesimus", + "twofold", + "twonk", + "thrive", + "through", + "threat", + "foundation", + "fourier", + "fourty"}; + + public void testTSTPersistence() throws Exception { + runTest(TSTLookup.class, true); + } + + public void testJaspellPersistence() throws Exception { + runTest(JaspellLookup.class, true); + } + + public void testFSTPersistence() throws Exception { + runTest(FSTLookup.class, false); + } + + private void runTest(Class lookupClass, + boolean supportsExactWeights) throws Exception { + + // Add all input keys. + Lookup lookup = lookupClass.newInstance(); + TermFreq[] keys = new TermFreq[this.keys.length]; + for (int i = 0; i < keys.length; i++) + keys[i] = new TermFreq(this.keys[i], (float) i); + lookup.build(new TermFreqArrayIterator(keys)); + + // Store the suggester. + File storeDir = TEMP_DIR; + lookup.store(storeDir); + + // Re-read it from disk. + lookup = lookupClass.newInstance(); + lookup.load(storeDir); + + // Assert validity. 
+ float previous = Float.NEGATIVE_INFINITY; + for (TermFreq k : keys) { + Float val = (Float) lookup.get(k.term); + assertNotNull(k.term, val); + + if (supportsExactWeights) { + assertEquals(k.term, Float.valueOf(k.v), val); + } else { + assertTrue(val + ">=" + previous, val >= previous); + previous = val.floatValue(); + } + } + } +} diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java 2011-05-22 18:52:02.000000000 -0400 @@ -0,0 +1,28 @@ +package org.apache.lucene.search.suggest; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public final class TermFreq { + public final String term; + public final float v; + + public TermFreq(String term, float v) { + this.term = term; + this.v = v; + } +} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java 2011-05-22 18:52:09.000000000 -0400 @@ -0,0 +1,57 @@ +package org.apache.lucene.search.suggest; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; +import java.util.Iterator; + +import org.apache.lucene.search.spell.TermFreqIterator; + +/** + * A {@link TermFreqIterator} over a sequence of {@link TermFreq}s. 
+ */ +public final class TermFreqArrayIterator implements TermFreqIterator { + private final Iterator i; + private TermFreq current; + + public TermFreqArrayIterator(Iterator i) { + this.i = i; + } + + public TermFreqArrayIterator(TermFreq [] i) { + this(Arrays.asList(i)); + } + + public TermFreqArrayIterator(Iterable i) { + this(i.iterator()); + } + + public float freq() { + return current.v; + } + + public boolean hasNext() { + return i.hasNext(); + } + + public String next() { + return (current = i.next()).term; + } + + public void remove() { throw new UnsupportedOperationException(); } +} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java --- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java 2011-05-22 18:52:18.000000000 -0400 @@ -0,0 +1,162 @@ +package org.apache.lucene.search.suggest.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Random; + +import org.apache.lucene.search.suggest.Lookup.LookupResult; +import org.apache.lucene.search.suggest.fst.FSTLookup; +import org.apache.lucene.util.LuceneTestCase; + +import org.apache.lucene.search.suggest.LookupBenchmarkTest; +import org.apache.lucene.search.suggest.TermFreq; +import org.apache.lucene.search.suggest.TermFreqArrayIterator; + +/** + * Unit tests for {@link FSTLookup}. + */ +public class FSTLookupTest extends LuceneTestCase { + public static TermFreq tf(String t, float v) { + return new TermFreq(t, v); + } + + private FSTLookup lookup; + + public void setUp() throws Exception { + super.setUp(); + final TermFreq[] keys = new TermFreq[] { + tf("one", 0.5f), + tf("oneness", 1), + tf("onerous", 1), + tf("onesimus", 1), + tf("two", 1), + tf("twofold", 1), + tf("twonk", 1), + tf("thrive", 1), + tf("through", 1), + tf("threat", 1), + tf("three", 1), + tf("foundation", 1), + tf("fourier", 1), + tf("four", 1), + tf("fourty", 1), + tf("xo", 1), + }; + + lookup = new FSTLookup(); + lookup.build(new TermFreqArrayIterator(keys)); + } + + public void testExactMatchHighPriority() throws Exception { + assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0"); + } + + public void testExactMatchLowPriority() throws Exception { + assertMatchEquals(lookup.lookup("one", true, 2), + "one/0.0", + "oneness/1.0"); + } + + public void testMiss() throws Exception { + assertMatchEquals(lookup.lookup("xyz", true, 1)); + } + + public void testAlphabeticWithWeights() throws Exception { + assertEquals(0, lookup.lookup("xyz", false, 1).size()); + } + + public void testFullMatchList() throws Exception { + assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE), + "oneness/1.0", + "onerous/1.0", + "onesimus/1.0", + "one/0.0"); + } + + public void testMultilingualInput() throws Exception { + List input = 
LookupBenchmarkTest.readTop50KWiki(); + + lookup = new FSTLookup(); + lookup.build(new TermFreqArrayIterator(input)); + + for (TermFreq tf : input) { + assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null); + assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key); + } + } + + public void testEmptyInput() throws Exception { + lookup = new FSTLookup(); + lookup.build(new TermFreqArrayIterator(new TermFreq[0])); + + assertMatchEquals(lookup.lookup("", true, 10)); + } + + public void testRandom() throws Exception { + List freqs = new ArrayList(); + Random rnd = random; + for (int i = 0; i < 5000; i++) { + freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100))); + } + lookup = new FSTLookup(); + lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()]))); + + for (TermFreq tf : freqs) { + final String term = tf.term; + for (int i = 1; i < term.length(); i++) { + String prefix = term.substring(0, i); + for (LookupResult lr : lookup.lookup(prefix, true, 10)) { + assertTrue(lr.key.startsWith(prefix)); + } + } + } + } + + private void assertMatchEquals(List res, String... expected) { + String [] result = new String [res.size()]; + for (int i = 0; i < res.size(); i++) + result[i] = res.get(i).toString(); + + if (!Arrays.equals(expected, result)) { + int colLen = Math.max(maxLen(expected), maxLen(result)); + + StringBuilder b = new StringBuilder(); + String format = "%" + colLen + "s " + "%" + colLen + "s\n"; + b.append(String.format(Locale.ENGLISH, format, "Expected", "Result")); + for (int i = 0; i < Math.max(result.length, expected.length); i++) { + b.append(String.format(Locale.ENGLISH, format, + i < expected.length ? expected[i] : "--", + i < result.length ? 
result[i] : "--")); + } + + System.err.println(b.toString()); + fail("Expected different output:\n" + b.toString()); + } + } + + private int maxLen(String[] result) { + int len = 0; + for (String s : result) + len = Math.max(len, s.length()); + return len; + } +} diff -ruN -x .svn -x build lucene-clean-trunk/solr/common-build.xml lucene-trunk/solr/common-build.xml --- lucene-clean-trunk/solr/common-build.xml 2011-05-22 12:37:58.000000000 -0400 +++ lucene-trunk/solr/common-build.xml 2011-05-22 18:56:05.000000000 -0400 @@ -188,12 +188,12 @@ + - @@ -204,12 +204,12 @@ + - @@ -226,6 +226,9 @@ + + + @@ -241,9 +244,6 @@ - - - @@ -252,12 +252,12 @@ + - diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java lucene-trunk/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java 2011-05-22 17:34:19.000000000 -0400 @@ -26,12 +26,12 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.search.spell.HighFrequencyDictionary; import org.apache.lucene.search.spell.PlainTextDictionary; import org.apache.lucene.store.RAMDirectory; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; import org.apache.solr.schema.FieldType; -import org.apache.solr.util.HighFrequencyDictionary; import org.apache.solr.search.SolrIndexSearcher; /** diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java lucene-trunk/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java 
2011-05-22 17:34:45.000000000 -0400 @@ -18,10 +18,11 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.search.spell.HighFrequencyDictionary; + import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; import org.apache.solr.search.SolrIndexSearcher; -import org.apache.solr.util.HighFrequencyDictionary; import java.io.File; import java.io.IOException; diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/BufferingTermFreqIteratorWrapper.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/BufferingTermFreqIteratorWrapper.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/BufferingTermFreqIteratorWrapper.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/BufferingTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,65 +0,0 @@ -package org.apache.solr.spelling.suggest; - - -import java.util.ArrayList; -import java.util.List; - -import org.apache.solr.util.TermFreqIterator; - -/** - * This wrapper buffers incoming elements. - */ -public class BufferingTermFreqIteratorWrapper implements TermFreqIterator { - - /** Entry in the buffer. 
*/ - public static final class Entry implements Comparable { - String word; - float freq; - - public Entry(String word, float freq) { - this.word = word; - this.freq = freq; - } - - public int compareTo(Entry o) { - return word.compareTo(o.word); - } - } - - protected ArrayList entries = new ArrayList(); - - protected int curPos; - protected Entry curEntry; - - public BufferingTermFreqIteratorWrapper(TermFreqIterator source) { - // read all source data into buffer - while (source.hasNext()) { - String w = source.next(); - Entry e = new Entry(w, source.freq()); - entries.add(e); - } - curPos = 0; - } - - public float freq() { - return curEntry.freq; - } - - public boolean hasNext() { - return curPos < entries.size(); - } - - public String next() { - curEntry = entries.get(curPos); - curPos++; - return curEntry.word; - } - - public void remove() { - throw new UnsupportedOperationException("remove is not supported"); - } - - public List entries() { - return entries; - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/FileDictionary.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/FileDictionary.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/FileDictionary.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/FileDictionary.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,95 +0,0 @@ -package org.apache.solr.spelling.suggest; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import java.io.*; - -import org.apache.lucene.search.spell.Dictionary; -import org.apache.solr.util.TermFreqIterator; - - -/** - * Dictionary represented by a text file. - * - *

Format allowed: 1 string per line, optionally with a tab-separated integer value:
- * word1 TAB 100
- * word2 word3 TAB 101
- * word4 word5 TAB 102
- */ -public class FileDictionary implements Dictionary { - - private BufferedReader in; - private String line; - private boolean hasNextCalled; - - public FileDictionary(InputStream dictFile) { - in = new BufferedReader(new InputStreamReader(dictFile)); - } - - /** - * Creates a dictionary based on a reader. - */ - public FileDictionary(Reader reader) { - in = new BufferedReader(reader); - } - - public TermFreqIterator getWordsIterator() { - return new fileIterator(); - } - - final class fileIterator implements TermFreqIterator { - private float curFreq; - - public String next() { - if (!hasNextCalled) { - hasNext(); - } - hasNextCalled = false; - return line; - } - - public float freq() { - return curFreq; - } - - public boolean hasNext() { - hasNextCalled = true; - try { - line = in.readLine(); - if (line != null) { - String[] fields = line.split("\t"); - if (fields.length > 1) { - curFreq = Float.parseFloat(fields[1]); - line = fields[0]; - } else { - curFreq = 1; - } - } - } catch (IOException ex) { - throw new RuntimeException(ex); - } - return (line != null) ? 
true : false; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/Lookup.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/Lookup.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/Lookup.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/Lookup.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,122 +0,0 @@ -package org.apache.solr.spelling.suggest; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; - -import org.apache.lucene.search.spell.Dictionary; -import org.apache.lucene.util.PriorityQueue; -import org.apache.solr.common.util.NamedList; -import org.apache.solr.core.SolrCore; -import org.apache.solr.util.TermFreqIterator; - -public abstract class Lookup { - /** - * Result of a lookup. - */ - public static final class LookupResult implements Comparable { - public final String key; - public final float value; - - public LookupResult(String key, float value) { - this.key = key; - this.value = value; - } - - @Override - public String toString() { - return key + "/" + value; - } - - /** Compare alphabetically. */ - public int compareTo(LookupResult o) { - return this.key.compareTo(o.key); - } - } - - public static final class LookupPriorityQueue extends PriorityQueue { - - public LookupPriorityQueue(int size) { - super(size); - } - - @Override - protected boolean lessThan(LookupResult a, LookupResult b) { - return a.value < b.value; - } - - public LookupResult[] getResults() { - int size = size(); - LookupResult[] res = new LookupResult[size]; - for (int i = size - 1; i >= 0; i--) { - res[i] = pop(); - } - return res; - } - } - - /** Initialize the lookup. */ - public abstract void init(NamedList config, SolrCore core); - - /** Build lookup from a dictionary. 
Some implementations may require sorted - * or unsorted keys from the dictionary's iterator - use - * {@link SortedTermFreqIteratorWrapper} or - * {@link UnsortedTermFreqIteratorWrapper} in such case. - */ - public void build(Dictionary dict) throws IOException { - Iterator it = dict.getWordsIterator(); - TermFreqIterator tfit; - if (it instanceof TermFreqIterator) { - tfit = (TermFreqIterator)it; - } else { - tfit = new TermFreqIterator.TermFreqIteratorWrapper(it); - } - build(tfit); - } - - protected abstract void build(TermFreqIterator tfit) throws IOException; - - /** - * Persist the constructed lookup data to a directory. Optional operation. - * @param storeDir directory where data can be stored. - * @return true if successful, false if unsuccessful or not supported. - * @throws IOException when fatal IO error occurs. - */ - public abstract boolean store(File storeDir) throws IOException; - - /** - * Discard current lookup data and load it from a previously saved copy. - * Optional operation. - * @param storeDir directory where lookup data was stored. - * @return true if completed successfully, false if unsuccessful or not supported. - * @throws IOException when fatal IO error occurs. - */ - public abstract boolean load(File storeDir) throws IOException; - - /** - * Look up a key and return possible completion for this key. - * @param key lookup key. Depending on the implementation this may be - * a prefix, misspelling, or even infix. - * @param onlyMorePopular return only more popular results - * @param num maximum number of results to return - * @return a list of possible completions, with their relative weight (e.g. popularity) - */ - public abstract List lookup(String key, boolean onlyMorePopular, int num); - - /** - * Modify the lookup data by recording additional data. Optional operation. 
- * @param key new lookup key - * @param value value to associate with this key - * @return true if new key is added, false if it already exists or operation - * is not supported. - */ - public abstract boolean add(String key, Object value); - - /** - * Get value associated with a specific key. - * @param key lookup key - * @return associated value - */ - public abstract Object get(String key); -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/LookupFactory.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/LookupFactory.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/LookupFactory.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/LookupFactory.java 2011-05-22 17:53:03.000000000 -0400 @@ -0,0 +1,29 @@ +package org.apache.solr.spelling.suggest; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.suggest.Lookup; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; + +/** + * Suggester factory for creating {@link Lookup} instances. 
+ */ +public abstract class LookupFactory { + public abstract Lookup create(NamedList params, SolrCore core); +} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/SortedTermFreqIteratorWrapper.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/SortedTermFreqIteratorWrapper.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/SortedTermFreqIteratorWrapper.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/SortedTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,18 +0,0 @@ -package org.apache.solr.spelling.suggest; - -import java.util.Collections; - -import org.apache.solr.util.SortedIterator; -import org.apache.solr.util.TermFreqIterator; - -/** - * This wrapper buffers incoming elements and makes sure they are sorted in - * ascending lexicographic order. - */ -public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator { - - public SortedTermFreqIteratorWrapper(TermFreqIterator source) { - super(source); - Collections.sort(entries); - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/Suggester.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/Suggester.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/Suggester.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/Suggester.java 2011-05-22 19:16:54.000000000 -0400 @@ -27,15 +27,20 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.spell.Dictionary; +import org.apache.lucene.search.spell.HighFrequencyDictionary; +import org.apache.lucene.search.suggest.FileDictionary; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.Lookup.LookupResult; + import 
org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.spelling.SolrSpellChecker; import org.apache.solr.spelling.SpellingOptions; import org.apache.solr.spelling.SpellingResult; -import org.apache.solr.spelling.suggest.Lookup.LookupResult; -import org.apache.solr.spelling.suggest.jaspell.JaspellLookup; -import org.apache.solr.util.HighFrequencyDictionary; +import org.apache.solr.spelling.suggest.fst.FSTLookupFactory; +import org.apache.solr.spelling.suggest.jaspell.JaspellLookupFactory; +import org.apache.solr.spelling.suggest.tst.TSTLookupFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -80,11 +85,18 @@ sourceLocation = (String) config.get(LOCATION); field = (String)config.get(FIELD); lookupImpl = (String)config.get(LOOKUP_IMPL); - if (lookupImpl == null) { - lookupImpl = JaspellLookup.class.getName(); + + // support the old classnames without -Factory for config file backwards compatibility. 
+ if (lookupImpl == null || "org.apache.solr.spelling.suggest.jaspell.JaspellLookup".equals(lookupImpl)) { + lookupImpl = JaspellLookupFactory.class.getName(); + } else if ("org.apache.solr.spelling.suggest.tst.TSTLookup".equals(lookupImpl)) { + lookupImpl = TSTLookupFactory.class.getName(); + } else if ("org.apache.solr.spelling.suggest.fst.FSTLookup".equals(lookupImpl)) { + lookupImpl = FSTLookupFactory.class.getName(); } - lookup = (Lookup) core.getResourceLoader().newInstance(lookupImpl); - lookup.init(config, core); + + LookupFactory factory = (LookupFactory) core.getResourceLoader().newInstance(lookupImpl); + lookup = factory.create(config, core); String store = (String)config.get(STORE_DIR); if (store != null) { storeDir = new File(store); diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/UnsortedTermFreqIteratorWrapper.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/UnsortedTermFreqIteratorWrapper.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/UnsortedTermFreqIteratorWrapper.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/UnsortedTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,17 +0,0 @@ -package org.apache.solr.spelling.suggest; - -import java.util.Collections; - -import org.apache.solr.util.TermFreqIterator; - -/** - * This wrapper buffers the incoming elements and makes sure they are in - * random order. 
- */ -public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper { - - public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) { - super(source); - Collections.shuffle(entries); - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookup.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookup.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookup.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookup.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,556 +0,0 @@ -package org.apache.solr.spelling.suggest.fst; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; - -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.automaton.fst.Builder; -import org.apache.lucene.util.automaton.fst.FST; -import org.apache.lucene.util.automaton.fst.FST.Arc; -import org.apache.lucene.util.automaton.fst.NoOutputs; -import org.apache.lucene.util.automaton.fst.Outputs; -import org.apache.solr.common.util.NamedList; -import org.apache.solr.core.SolrCore; -import org.apache.solr.spelling.suggest.Lookup; -import org.apache.solr.spelling.suggest.tst.TSTLookup; -import org.apache.solr.util.TermFreqIterator; - -import com.google.common.collect.Lists; -import com.google.common.io.Closeables; - -/** - * Finite state automata based implementation of {@link Lookup} query - * suggestion/ autocomplete interface. - * - *

Implementation details

- * - *

The construction step in {@link #build(TermFreqIterator)} works as follows: - *

    - *
  • A set of input terms (String) and weights (float) is given.
  • - *
  • The range of weights is determined and then all weights are discretized into a fixed set - * of values ({@link #buckets}). - * Note that this means that minor changes in weights may be lost during automaton construction. - * In general, this is not a big problem because the "priorities" of completions can be split - * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal). - * If you need exact, fine-grained weights, use {@link TSTLookup} instead.
  • - *
  • All terms in the input are preprended with a synthetic pseudo-character being the weight - * of that term. For example a term abc with a discretized weight equal '1' would - * become 1abc.
  • - *
  • The terms are sorted by their raw value of utf16 character values (including the synthetic - * term in front).
  • - *
  • A finite state automaton ({@link FST}) is constructed from the input. The root node has - * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.
  • - *
- * - *

At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows: - *

    - *
  • For each possible term weight encoded in the automaton (cached arcs from the root above), - * starting with the highest one, we descend along the path of the input key. If the key is not - * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately. - * No completions. - *
  • Otherwise, we have found an internal automaton node that ends the key. The entire - * subautomaton (all paths) starting from this node form the key's completions. We start - * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single - * suggestion to the list of results (the weight of this suggestion is constant and equal to the - * root path we started from). The tricky part is that because automaton edges are sorted and - * we scan depth-first, we can terminate the entire procedure as soon as we collect enough - * suggestions the user requested. - *
  • In case the number of suggestions collected in the step above is still insufficient, - * we proceed to the next (smaller) weight leaving the root node and repeat the same - * algorithm again. - *
  • - *
- * - *

Runtime behavior and performance characteristic

- * - *

The algorithm described above is optimized for finding suggestions to short prefixes - * in a top-weights-first order. This is probably the most common use case: it allows - * presenting suggestions early and sorts them by the global frequency (and then alphabetically). - * - *

If there is an exact match in the automaton, it is returned first on the results - * list (even with by-weight sorting). - * - *

Note that the maximum lookup time for any prefix - * is the time of descending to the subtree, plus traversal of the subtree up to the number - * of requested suggestions (because they are already presorted by weight on the root level - * and alphabetically at any node level). - * - *

To order alphabetically only (no ordering by priorities), use identical term weights - * for all terms. Alphabetical suggestions are returned even if non-constant weights are - * used, but the algorithm for doing this is suboptimal. - * - *

"alphabetically" in any of the documentation above indicates utf16 codepoint order, - * nothing else. - */ -public class FSTLookup extends Lookup { - /** A structure for a single entry (for sorting/ preprocessing). */ - private static class Entry { - char [] term; - float weight; - - public Entry(char [] term, float freq) { - this.term = term; - this.weight = freq; - } - } - - /** - * The number of separate buckets for weights (discretization). The more buckets, - * the more fine-grained term weights (priorities) can be assigned. The speed of lookup - * will not decrease for prefixes which have highly-weighted completions (because these - * are filled-in first), but will decrease significantly for low-weighted terms (but - * these should be infrequent, so it is all right). - * - *

The number of buckets must be within [1, 255] range. - */ - public static final String WEIGHT_BUCKETS = "weightBuckets"; - - /** - * If true, exact suggestions are returned first, even if they are prefixes - * of other strings in the automaton (possibly with larger weights). - */ - public static final String EXACT_MATCH_FIRST = "exactMatchFirst"; - - /** Serialized automaton file name (storage). */ - public static final String FILENAME = "fst.dat"; - - /** An empty result. */ - private static final List EMPTY_RESULT = Lists.newArrayList(); - - /** - * @see #WEIGHT_BUCKETS - */ - private int buckets = 10; - - /** - * #see #EXACT_MATCH_FIRST - */ - private boolean exactMatchFirst = true; - - /** - * Finite state automaton encoding all the lookup terms. See class - * notes for details. - */ - private FST automaton; - - /** - * An array of arcs leaving the root automaton state and encoding weights of all - * completions in their sub-trees. - */ - private Arc [] rootArcs; - - /* */ - @Override - @SuppressWarnings("rawtypes") - public void init(NamedList config, SolrCore core) { - this.buckets = config.get(WEIGHT_BUCKETS) != null - ? Integer.parseInt(config.get(WEIGHT_BUCKETS).toString()) - : 10; - - this.exactMatchFirst = config.get(EXACT_MATCH_FIRST) != null - ? Boolean.valueOf(config.get(EXACT_MATCH_FIRST).toString()) - : true; - } - - /* */ - @Override - public void build(TermFreqIterator tfit) throws IOException { - // Buffer the input because we will need it twice: for calculating - // weights distribution and for the actual automata building. - List entries = Lists.newArrayList(); - while (tfit.hasNext()) { - String term = tfit.next(); - char [] termChars = new char [term.length() + 1]; // add padding for weight. - for (int i = 0; i < term.length(); i++) - termChars[i + 1] = term.charAt(i); - entries.add(new Entry(termChars, tfit.freq())); - } - - // Distribute weights into at most N buckets. 
This is a form of discretization to - // limit the number of possible weights so that they can be efficiently encoded in the - // automaton. - // - // It is assumed the distribution of weights is _linear_ so proportional division - // of [min, max] range will be enough here. Other approaches could be to sort - // weights and divide into proportional ranges. - if (entries.size() > 0) { - redistributeWeightsProportionalMinMax(entries, buckets); - encodeWeightPrefix(entries); - } - - // Build the automaton (includes input sorting) and cache root arcs in order from the highest, - // to the lowest weight. - this.automaton = buildAutomaton(entries); - cacheRootArcs(); - } - - /** - * Cache the root node's output arcs starting with completions with the highest weights. - */ - @SuppressWarnings("unchecked") - private void cacheRootArcs() throws IOException { - if (automaton != null) { - List> rootArcs = Lists.newArrayList(); - Arc arc = automaton.getFirstArc(new Arc()); - automaton.readFirstTargetArc(arc, arc); - while (true) { - rootArcs.add(new Arc().copyFrom(arc)); - if (arc.isLast()) - break; - automaton.readNextArc(arc); - } - - Collections.reverse(rootArcs); // we want highest weights first. - this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]); - } - } - - /** - * Not implemented. - */ - @Override - public boolean add(String key, Object value) { - // This implementation does not support ad-hoc additions (all input - // must be sorted for the builder). - return false; - } - - /** - * Get the (approximated) weight of a single key (if there is a perfect match - * for it in the automaton). - * - * @return Returns the approximated weight of the input key or null - * if not found. - */ - @Override - public Float get(String key) { - return getExactMatchStartingFromRootArc(0, key); - } - - /** - * Returns the first exact match by traversing root arcs, starting from - * the arc i. 
- * - * @param i The first root arc index in {@link #rootArcs} to consider when - * matching. - */ - private Float getExactMatchStartingFromRootArc(int i, String key) { - // Get the UTF-8 bytes representation of the input key. - try { - final FST.Arc scratch = new FST.Arc(); - for (; i < rootArcs.length; i++) { - final FST.Arc rootArc = rootArcs[i]; - final FST.Arc arc = scratch.copyFrom(rootArc); - - // Descend into the automaton using the key as prefix. - if (descendWithPrefix(arc, key)) { - automaton.readFirstTargetArc(arc, arc); - if (arc.label == FST.END_LABEL) { - // Prefix-encoded weight. - return rootArc.label / (float) buckets; - } - } - } - } catch (IOException e) { - // Should never happen, but anyway. - throw new RuntimeException(e); - } - - return null; - } - - /** - * Lookup autocomplete suggestions to key. - * - * @param key The prefix to which suggestions should be sought. - * @param onlyMorePopular Return most popular suggestions first. This is the default - * behavior for this implementation. Setting it to false has no effect (use - * constant term weights to sort alphabetically only). - * @param num At most this number of suggestions will be returned. - * @return Returns the suggestions, sorted by their approximated weight first (decreasing) - * and then alphabetically (utf16 codepoint order). - */ - @Override - public List lookup(String key, boolean onlyMorePopular, int num) { - if (key.length() == 0 || automaton == null) { - // Keep the result an ArrayList to keep calls monomorphic. - return EMPTY_RESULT; - } - - try { - if (!onlyMorePopular && rootArcs.length > 1) { - // We could emit a warning here (?). An optimal strategy for alphabetically sorted - // suggestions would be to add them with a constant weight -- this saves unnecessary - // traversals and sorting. - return lookupSortedAlphabetically(key, num); - } else { - return lookupSortedByWeight(key, num, true); - } - } catch (IOException e) { - // Should never happen, but anyway. 
- throw new RuntimeException(e); - } - } - - /** - * Lookup suggestions sorted alphabetically if weights are not constant. This - * is a workaround: in general, use constant weights for alphabetically sorted result. - */ - private List lookupSortedAlphabetically(String key, int num) throws IOException { - // Greedily get num results from each weight branch. - List res = lookupSortedByWeight(key, num, false); - - // Sort and trim. - Collections.sort(res, new Comparator() { - @Override - public int compare(LookupResult o1, LookupResult o2) { - return o1.key.compareTo(o2.key); - } - }); - if (res.size() > num) { - res = res.subList(0, num); - } - return res; - } - - /** - * Lookup suggestions sorted by weight (descending order). - * - * @param greedy If true, the routine terminates immediately when num - * suggestions have been collected. If false, it will collect suggestions from - * all weight arcs (needed for {@link #lookupSortedAlphabetically}. - */ - private ArrayList lookupSortedByWeight(String key, int num, boolean greedy) throws IOException { - final ArrayList res = new ArrayList(Math.min(10, num)); - final StringBuilder output = new StringBuilder(key); - final int matchLength = key.length() - 1; - - for (int i = 0; i < rootArcs.length; i++) { - final FST.Arc rootArc = rootArcs[i]; - final FST.Arc arc = new FST.Arc().copyFrom(rootArc); - - // Descend into the automaton using the key as prefix. - if (descendWithPrefix(arc, key)) { - // Prefix-encoded weight. - final float weight = rootArc.label / (float) buckets; - - // A subgraph starting from the current node has the completions - // of the key prefix. The arc we're at is the last key's byte, - // so we will collect it too. - output.setLength(matchLength); - if (collect(res, num, weight, output, arc) && greedy) { - // We have enough suggestion to return immediately. Keep on looking for an - // exact match, if requested. 
- if (exactMatchFirst) { - Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key); - if (exactMatchWeight != null) { - res.add(0, new LookupResult(key, exactMatchWeight)); - while (res.size() > num) { - res.remove(res.size() - 1); - } - } - } - break; - } - } - } - return res; - } - - /** - * Descend along the path starting at arc and going through - * bytes in utf8 argument. - * - * @param arc The starting arc. This argument is modified in-place. - * @param term The term to descend with. - * @return If true, arc will be set to the arc matching - * last byte of utf8. false is returned if no such - * prefix utf8 exists. - */ - private boolean descendWithPrefix(Arc arc, String term) throws IOException { - final int max = term.length(); - - for (int i = 0; i < max; i++) { - if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) { - // No matching prefixes, return an empty result. - return false; - } - } - - return true; - } - - /** - * Recursive collect lookup results from the automaton subgraph starting at arc. - * - * @param num Maximum number of results needed (early termination). - * @param weight Weight of all results found during this collection. - */ - private boolean collect(List res, int num, float weight, StringBuilder output, Arc arc) throws IOException { - output.append((char) arc.label); - - automaton.readFirstTargetArc(arc, arc); - while (true) { - if (arc.label == FST.END_LABEL) { - res.add(new LookupResult(output.toString(), weight)); - if (res.size() >= num) - return true; - } else { - int save = output.length(); - if (collect(res, num, weight, output, new Arc().copyFrom(arc))) { - return true; - } - output.setLength(save); - } - - if (arc.isLast()) { - break; - } - automaton.readNextArc(arc); - } - return false; - } - - /** - * Builds the final automaton from a list of entries. 
- */ - private FST buildAutomaton(List entries) throws IOException { - if (entries.size() == 0) - return null; - - // Sort by utf16 (raw char value) - final Comparator comp = new Comparator() { - public int compare(Entry o1, Entry o2) { - char [] ch1 = o1.term; - char [] ch2 = o2.term; - int len1 = ch1.length; - int len2 = ch2.length; - - int max = Math.min(len1, len2); - for (int i = 0; i < max; i++) { - int v = ch1[i] - ch2[i]; - if (v != 0) return v; - } - return len1 - len2; - } - }; - Collections.sort(entries, comp); - - // Avoid duplicated identical entries, if possible. This is required because - // it breaks automaton construction otherwise. - int len = entries.size(); - int j = 0; - for (int i = 1; i < len; i++) { - if (comp.compare(entries.get(j), entries.get(i)) != 0) { - entries.set(++j, entries.get(i)); - } - } - entries = entries.subList(0, j + 1); - - // Build the automaton. - final Outputs outputs = NoOutputs.getSingleton(); - final Object empty = outputs.getNoOutput(); - final Builder builder = - new Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); - final IntsRef scratchIntsRef = new IntsRef(10); - for (Entry e : entries) { - final int termLength = scratchIntsRef.length = e.term.length; - - scratchIntsRef.grow(termLength); - final int [] ints = scratchIntsRef.ints; - final char [] chars = e.term; - for (int i = termLength; --i >= 0;) { - ints[i] = chars[i]; - } - builder.add(scratchIntsRef, empty); - } - return builder.finish(); - } - - /** - * Prepends the entry's weight to each entry, encoded as a single byte, so that the - * root automaton node fans out to all possible priorities, starting with the arc that has - * the highest weights. - */ - private void encodeWeightPrefix(List entries) { - for (Entry e : entries) { - int weight = (int) e.weight; - assert (weight >= 0 && weight <= buckets) : - "Weight out of range: " + weight + " [" + buckets + "]"; - - // There should be a single empty char reserved in front for the weight. 
- e.term[0] = (char) weight; - } - } - - /** - * Split [min, max] range into buckets, reassigning weights. Entries' weights are - * remapped to [0, buckets] range (so, buckets + 1 buckets, actually). - */ - private void redistributeWeightsProportionalMinMax(List entries, int buckets) { - float min = entries.get(0).weight; - float max = min; - for (Entry e : entries) { - min = Math.min(e.weight, min); - max = Math.max(e.weight, max); - } - - final float range = max - min; - for (Entry e : entries) { - e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor() - } - } - - /** - * Deserialization from disk. - */ - @Override - public synchronized boolean load(File storeDir) throws IOException { - File data = new File(storeDir, FILENAME); - if (!data.exists() || !data.canRead()) { - return false; - } - - InputStream is = new BufferedInputStream(new FileInputStream(data)); - try { - this.automaton = new FST(new InputStreamDataInput(is), NoOutputs.getSingleton()); - cacheRootArcs(); - } finally { - Closeables.closeQuietly(is); - } - return true; - } - - /** - * Serialization to disk. 
- */ - @Override - public synchronized boolean store(File storeDir) throws IOException { - if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { - return false; - } - - if (this.automaton == null) - return false; - - File data = new File(storeDir, FILENAME); - OutputStream os = new BufferedOutputStream(new FileOutputStream(data)); - try { - this.automaton.save(new OutputStreamDataOutput(os)); - } finally { - Closeables.closeQuietly(os); - } - - return true; - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java 2011-05-22 18:12:43.000000000 -0400 @@ -0,0 +1,60 @@ +package org.apache.solr.spelling.suggest.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.fst.FSTLookup; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.spelling.suggest.LookupFactory; + +/** + * Factory for {@link FSTLookup} + */ +public class FSTLookupFactory extends LookupFactory { + + /** + * The number of separate buckets for weights (discretization). The more buckets, + * the more fine-grained term weights (priorities) can be assigned. The speed of lookup + * will not decrease for prefixes which have highly-weighted completions (because these + * are filled-in first), but will decrease significantly for low-weighted terms (but + * these should be infrequent, so it is all right). + * + *

The number of buckets must be within [1, 255] range. + */ + public static final String WEIGHT_BUCKETS = "weightBuckets"; + + /** + * If true, exact suggestions are returned first, even if they are prefixes + * of other strings in the automaton (possibly with larger weights). + */ + public static final String EXACT_MATCH_FIRST = "exactMatchFirst"; + + @Override + public Lookup create(NamedList params, SolrCore core) { + int buckets = params.get(WEIGHT_BUCKETS) != null + ? Integer.parseInt(params.get(WEIGHT_BUCKETS).toString()) + : 10; + + boolean exactMatchFirst = params.get(EXACT_MATCH_FIRST) != null + ? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString()) + : true; + + return new FSTLookup(buckets, exactMatchFirst); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/InputStreamDataInput.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/InputStreamDataInput.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/InputStreamDataInput.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/InputStreamDataInput.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,31 +0,0 @@ -package org.apache.solr.spelling.suggest.fst; - -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import org.apache.lucene.store.DataInput; -import com.google.common.io.ByteStreams; - -/** - * A {@link DataInput} wrapping a plain {@link InputStream}. 
- */ -public class InputStreamDataInput extends DataInput { - - private final InputStream is; - - public InputStreamDataInput(InputStream is) { - this.is = is; - } - - @Override - public byte readByte() throws IOException { - int v = is.read(); - if (v == -1) throw new EOFException(); - return (byte) v; - } - - @Override - public void readBytes(byte[] b, int offset, int len) throws IOException { - ByteStreams.readFully(is, b, offset, len); - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/OutputStreamDataOutput.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/OutputStreamDataOutput.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/OutputStreamDataOutput.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/OutputStreamDataOutput.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,28 +0,0 @@ -package org.apache.solr.spelling.suggest.fst; - -import java.io.IOException; -import java.io.OutputStream; - -import org.apache.lucene.store.DataOutput; - -/** - * A {@link DataOutput} wrapping a plain {@link OutputStream}. 
- */ -public class OutputStreamDataOutput extends DataOutput { - - private final OutputStream os; - - public OutputStreamDataOutput(OutputStream os) { - this.os = os; - } - - @Override - public void writeByte(byte b) throws IOException { - os.write(b); - } - - @Override - public void writeBytes(byte[] b, int offset, int length) throws IOException { - os.write(b, offset, length); - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookup.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookup.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookup.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookup.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,182 +0,0 @@ -package org.apache.solr.spelling.suggest.jaspell; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.solr.common.util.NamedList; -import org.apache.solr.core.SolrCore; -import org.apache.solr.spelling.suggest.Lookup; -import org.apache.solr.spelling.suggest.UnsortedTermFreqIteratorWrapper; -import org.apache.solr.spelling.suggest.jaspell.JaspellTernarySearchTrie.TSTNode; -import org.apache.solr.util.SortedIterator; -import org.apache.solr.util.TermFreqIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class JaspellLookup extends Lookup { - private static final Logger LOG = LoggerFactory.getLogger(JaspellLookup.class); - JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie(); - private boolean usePrefix = true; - private int editDistance = 2; - - @Override - public void init(NamedList config, SolrCore core) { - LOG.info("init: " + config); - } - - @Override - public 
void build(TermFreqIterator tfit) throws IOException { - if (tfit instanceof SortedIterator) { - // make sure it's unsorted - tfit = new UnsortedTermFreqIteratorWrapper(tfit); - } - trie = new JaspellTernarySearchTrie(); - trie.setMatchAlmostDiff(editDistance); - while (tfit.hasNext()) { - String key = tfit.next(); - float freq = tfit.freq(); - if (key.length() == 0) { - continue; - } - trie.put(key, new Float(freq)); - } - } - - @Override - public boolean add(String key, Object value) { - trie.put(key, value); - // XXX - return false; - } - - @Override - public Object get(String key) { - return trie.get(key); - } - - @Override - public List lookup(String key, boolean onlyMorePopular, int num) { - List res = new ArrayList(); - List list; - int count = onlyMorePopular ? num * 2 : num; - if (usePrefix) { - list = trie.matchPrefix(key, count); - } else { - list = trie.matchAlmost(key, count); - } - if (list == null || list.size() == 0) { - return res; - - } - int maxCnt = Math.min(num, list.size()); - if (onlyMorePopular) { - LookupPriorityQueue queue = new LookupPriorityQueue(num); - for (String s : list) { - float freq = (Float)trie.get(s); - queue.insertWithOverflow(new LookupResult(s, freq)); - } - for (LookupResult lr : queue.getResults()) { - res.add(lr); - } - } else { - for (int i = 0; i < maxCnt; i++) { - String s = list.get(i); - float freq = (Float)trie.get(s); - res.add(new LookupResult(s, freq)); - } - } - return res; - } - - public static final String FILENAME = "jaspell.dat"; - private static final byte LO_KID = 0x01; - private static final byte EQ_KID = 0x02; - private static final byte HI_KID = 0x04; - private static final byte HAS_VALUE = 0x08; - - - @Override - public boolean load(File storeDir) throws IOException { - File data = new File(storeDir, FILENAME); - if (!data.exists() || !data.canRead()) { - return false; - } - DataInputStream in = new DataInputStream(new FileInputStream(data)); - TSTNode root = trie.new TSTNode('\0', null); - try { - 
readRecursively(in, root); - trie.setRoot(root); - } finally { - in.close(); - } - return true; - } - - private void readRecursively(DataInputStream in, TSTNode node) throws IOException { - node.splitchar = in.readChar(); - byte mask = in.readByte(); - if ((mask & HAS_VALUE) != 0) { - node.data = new Float(in.readFloat()); - } - if ((mask & LO_KID) != 0) { - TSTNode kid = trie.new TSTNode('\0', node); - node.relatives[TSTNode.LOKID] = kid; - readRecursively(in, kid); - } - if ((mask & EQ_KID) != 0) { - TSTNode kid = trie.new TSTNode('\0', node); - node.relatives[TSTNode.EQKID] = kid; - readRecursively(in, kid); - } - if ((mask & HI_KID) != 0) { - TSTNode kid = trie.new TSTNode('\0', node); - node.relatives[TSTNode.HIKID] = kid; - readRecursively(in, kid); - } - } - - @Override - public boolean store(File storeDir) throws IOException { - if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { - return false; - } - TSTNode root = trie.getRoot(); - if (root == null) { // empty tree - return false; - } - File data = new File(storeDir, FILENAME); - DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); - try { - writeRecursively(out, root); - out.flush(); - } finally { - out.close(); - } - return true; - } - - private void writeRecursively(DataOutputStream out, TSTNode node) throws IOException { - if (node == null) { - return; - } - out.writeChar(node.splitchar); - byte mask = 0; - if (node.relatives[TSTNode.LOKID] != null) mask |= LO_KID; - if (node.relatives[TSTNode.EQKID] != null) mask |= EQ_KID; - if (node.relatives[TSTNode.HIKID] != null) mask |= HI_KID; - if (node.data != null) mask |= HAS_VALUE; - out.writeByte(mask); - if (node.data != null) { - out.writeFloat((Float)node.data); - } - writeRecursively(out, node.relatives[TSTNode.LOKID]); - writeRecursively(out, node.relatives[TSTNode.EQKID]); - writeRecursively(out, node.relatives[TSTNode.HIKID]); - } -} diff -ruN -x .svn -x build 
lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookupFactory.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookupFactory.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookupFactory.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookupFactory.java 2011-05-22 18:07:01.000000000 -0400 @@ -0,0 +1,39 @@ +package org.apache.solr.spelling.suggest.jaspell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.jaspell.JaspellLookup; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.spelling.suggest.LookupFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Factory for {@link JaspellLookup} + */ +public class JaspellLookupFactory extends LookupFactory { + private static final Logger LOG = LoggerFactory.getLogger(JaspellLookup.class); + + @Override + public Lookup create(NamedList params, SolrCore core) { + LOG.info("init: " + params); + return new JaspellLookup(); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellTernarySearchTrie.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellTernarySearchTrie.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellTernarySearchTrie.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellTernarySearchTrie.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,866 +0,0 @@ -package org.apache.solr.spelling.suggest.jaspell; - -/** - * Copyright (c) 2005 Bruno Martins - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
Neither the name of the organization nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - */ - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.List; -import java.util.Vector; -import java.util.zip.GZIPInputStream; - -/** - * Implementation of a Ternary Search Trie, a data structure for storing - * String objects that combines the compact size of a binary search - * tree with the speed of a digital search trie, and is therefore ideal for - * practical use in sorting and searching data.

- *

- * - * This data structure is faster than hashing for many typical search problems, - * and supports a broader range of useful problems and operations. Ternary - * searches are faster than hashing and more powerful, too. - *

- *

- * - * The theory of ternary search trees was described at a symposium in 1997 (see - * "Fast Algorithms for Sorting and Searching Strings," by J.L. Bentley and R. - * Sedgewick, Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete - * Algorithms, January 1997). Algorithms in C, Third Edition, by Robert - * Sedgewick (Addison-Wesley, 1998) provides yet another view of ternary search - * trees. - * - * @author Bruno Martins - * - */ -public class JaspellTernarySearchTrie { - - /** - * An inner class of Ternary Search Trie that represents a node in the trie. - */ - protected final class TSTNode { - - /** Index values for accessing relatives array. */ - protected final static int PARENT = 0, LOKID = 1, EQKID = 2, HIKID = 3; - - /** The key to the node. */ - protected Object data; - - /** The relative nodes. */ - protected TSTNode[] relatives = new TSTNode[4]; - - /** The char used in the split. */ - protected char splitchar; - - /** - * Constructor method. - * - *@param splitchar - * The char used in the split. - *@param parent - * The parent node. - */ - protected TSTNode(char splitchar, TSTNode parent) { - this.splitchar = splitchar; - relatives[PARENT] = parent; - } - } - - /** - * Compares characters by alfabetical order. - * - *@param cCompare2 - * The first char in the comparison. - *@param cRef - * The second char in the comparison. - *@return A negative number, 0 or a positive number if the second char is - * less, equal or greater. - */ - private static int compareCharsAlphabetically(char cCompare2, char cRef) { - return Character.toLowerCase(cCompare2) - Character.toLowerCase(cRef); - } - - /* what follows is the original Jaspell code. 
- private static int compareCharsAlphabetically(int cCompare2, int cRef) { - int cCompare = 0; - if (cCompare2 >= 65) { - if (cCompare2 < 89) { - cCompare = (2 * cCompare2) - 65; - } else if (cCompare2 < 97) { - cCompare = cCompare2 + 24; - } else if (cCompare2 < 121) { - cCompare = (2 * cCompare2) - 128; - } else cCompare = cCompare2; - } else cCompare = cCompare2; - if (cRef < 65) { - return cCompare - cRef; - } - if (cRef < 89) { - return cCompare - ((2 * cRef) - 65); - } - if (cRef < 97) { - return cCompare - (cRef + 24); - } - if (cRef < 121) { - return cCompare - ((2 * cRef) - 128); - } - return cCompare - cRef; - } - */ - - /** - * The default number of values returned by the matchAlmost - * method. - */ - private int defaultNumReturnValues = -1; - - /** - * the number of differences allowed in a call to the - * matchAlmostKey method. - */ - private int matchAlmostDiff; - - /** The base node in the trie. */ - private TSTNode rootNode; - - /** - * Constructs an empty Ternary Search Trie. - */ - public JaspellTernarySearchTrie() { - } - - // for loading - void setRoot(TSTNode newRoot) { - rootNode = newRoot; - } - - // for saving - TSTNode getRoot() { - return rootNode; - } - - /** - * Constructs a Ternary Search Trie and loads data from a File - * into the Trie. The file is a normal text document, where each line is of - * the form word TAB float. - * - *@param file - * The File with the data to load into the Trie. - *@exception IOException - * A problem occured while reading the data. - */ - public JaspellTernarySearchTrie(File file) throws IOException { - this(file, false); - } - - /** - * Constructs a Ternary Search Trie and loads data from a File - * into the Trie. The file is a normal text document, where each line is of - * the form "word TAB float". - * - *@param file - * The File with the data to load into the Trie. - *@param compression - * If true, the file is compressed with the GZIP algorithm, and if - * false, the file is a normal text document. 
- *@exception IOException - * A problem occured while reading the data. - */ - public JaspellTernarySearchTrie(File file, boolean compression) - throws IOException { - this(); - BufferedReader in; - if (compression) - in = new BufferedReader(new InputStreamReader(new GZIPInputStream( - new FileInputStream(file)))); - else in = new BufferedReader(new InputStreamReader((new FileInputStream( - file)))); - String word; - int pos; - Float occur, one = new Float(1); - int numWords = 0; - while ((word = in.readLine()) != null) { - numWords++; - pos = word.indexOf("\t"); - occur = one; - if (pos != -1) { - occur = Float.parseFloat(word.substring(pos + 1).trim()); - word = word.substring(0, pos); - } - String key = word.toLowerCase(); - if (rootNode == null) { - rootNode = new TSTNode(key.charAt(0), null); - } - TSTNode node = null; - if (key.length() > 0 && rootNode != null) { - TSTNode currentNode = rootNode; - int charIndex = 0; - while (true) { - if (currentNode == null) break; - int charComp = compareCharsAlphabetically(key.charAt(charIndex), - currentNode.splitchar); - if (charComp == 0) { - charIndex++; - if (charIndex == key.length()) { - node = currentNode; - break; - } - currentNode = currentNode.relatives[TSTNode.EQKID]; - } else if (charComp < 0) { - currentNode = currentNode.relatives[TSTNode.LOKID]; - } else { - currentNode = currentNode.relatives[TSTNode.HIKID]; - } - } - Float occur2 = null; - if (node != null) occur2 = ((Float) (node.data)); - if (occur2 != null) { - occur += occur2.floatValue(); - } - currentNode = getOrCreateNode(word.trim().toLowerCase()); - currentNode.data = occur; - } - } - in.close(); - } - - /** - * Deletes the node passed in as an argument. If this node has non-null data, - * then both the node and the data will be deleted. It also deletes any other - * nodes in the trie that are no longer needed after the deletion of the node. - * - *@param nodeToDelete - * The node to delete. 
- */ - private void deleteNode(TSTNode nodeToDelete) { - if (nodeToDelete == null) { - return; - } - nodeToDelete.data = null; - while (nodeToDelete != null) { - nodeToDelete = deleteNodeRecursion(nodeToDelete); - // deleteNodeRecursion(nodeToDelete); - } - } - - /** - * Recursively visits each node to be deleted. - * - * To delete a node, first set its data to null, then pass it into this - * method, then pass the node returned by this method into this method (make - * sure you don't delete the data of any of the nodes returned from this - * method!) and continue in this fashion until the node returned by this - * method is null. - * - * The TSTNode instance returned by this method will be next node to be - * operated on by deleteNodeRecursion (This emulates recursive - * method call while avoiding the JVM overhead normally associated with a - * recursive method.) - * - *@param currentNode - * The node to delete. - *@return The next node to be called in deleteNodeRecursion. - */ - private TSTNode deleteNodeRecursion(TSTNode currentNode) { - if (currentNode == null) { - return null; - } - if (currentNode.relatives[TSTNode.EQKID] != null - || currentNode.data != null) { - return null; - } - // can't delete this node if it has a non-null eq kid or data - TSTNode currentParent = currentNode.relatives[TSTNode.PARENT]; - boolean lokidNull = currentNode.relatives[TSTNode.LOKID] == null; - boolean hikidNull = currentNode.relatives[TSTNode.HIKID] == null; - int childType; - if (currentParent.relatives[TSTNode.LOKID] == currentNode) { - childType = TSTNode.LOKID; - } else if (currentParent.relatives[TSTNode.EQKID] == currentNode) { - childType = TSTNode.EQKID; - } else if (currentParent.relatives[TSTNode.HIKID] == currentNode) { - childType = TSTNode.HIKID; - } else { - rootNode = null; - return null; - } - if (lokidNull && hikidNull) { - currentParent.relatives[childType] = null; - return currentParent; - } - if (lokidNull) { - currentParent.relatives[childType] = 
currentNode.relatives[TSTNode.HIKID]; - currentNode.relatives[TSTNode.HIKID].relatives[TSTNode.PARENT] = currentParent; - return currentParent; - } - if (hikidNull) { - currentParent.relatives[childType] = currentNode.relatives[TSTNode.LOKID]; - currentNode.relatives[TSTNode.LOKID].relatives[TSTNode.PARENT] = currentParent; - return currentParent; - } - int deltaHi = currentNode.relatives[TSTNode.HIKID].splitchar - - currentNode.splitchar; - int deltaLo = currentNode.splitchar - - currentNode.relatives[TSTNode.LOKID].splitchar; - int movingKid; - TSTNode targetNode; - if (deltaHi == deltaLo) { - if (Math.random() < 0.5) { - deltaHi++; - } else { - deltaLo++; - } - } - if (deltaHi > deltaLo) { - movingKid = TSTNode.HIKID; - targetNode = currentNode.relatives[TSTNode.LOKID]; - } else { - movingKid = TSTNode.LOKID; - targetNode = currentNode.relatives[TSTNode.HIKID]; - } - while (targetNode.relatives[movingKid] != null) { - targetNode = targetNode.relatives[movingKid]; - } - targetNode.relatives[movingKid] = currentNode.relatives[movingKid]; - currentParent.relatives[childType] = targetNode; - targetNode.relatives[TSTNode.PARENT] = currentParent; - if (!lokidNull) { - currentNode.relatives[TSTNode.LOKID] = null; - } - if (!hikidNull) { - currentNode.relatives[TSTNode.HIKID] = null; - } - return currentParent; - } - - /** - * Retrieve the object indexed by a key. - * - *@param key - * A String index. - *@return The object retrieved from the Ternary Search Trie. - */ - public Object get(String key) { - TSTNode node = getNode(key.trim().toLowerCase()); - if (node == null) { - return null; - } - return node.data; - } - - /** - * Retrieve the Float indexed by key, increment it by one unit - * and store the new Float. - * - *@param key - * A String index. - *@return The Float retrieved from the Ternary Search Trie. 
- */ - public Float getAndIncrement(String key) { - String key2 = key.trim().toLowerCase(); - TSTNode node = getNode(key2); - if (node == null) { - return null; - } - Float aux = (Float) (node.data); - if (aux == null) { - aux = new Float(1); - } else { - aux = new Float(aux.intValue() + 1); - } - put(key2, aux); - return aux; - } - - /** - * Returns the key that indexes the node argument. - * - *@param node - * The node whose index is to be calculated. - *@return The String that indexes the node argument. - */ - protected String getKey(TSTNode node) { - StringBuffer getKeyBuffer = new StringBuffer(); - getKeyBuffer.setLength(0); - getKeyBuffer.append("" + node.splitchar); - TSTNode currentNode; - TSTNode lastNode; - currentNode = node.relatives[TSTNode.PARENT]; - lastNode = node; - while (currentNode != null) { - if (currentNode.relatives[TSTNode.EQKID] == lastNode) { - getKeyBuffer.append("" + currentNode.splitchar); - } - lastNode = currentNode; - currentNode = currentNode.relatives[TSTNode.PARENT]; - } - getKeyBuffer.reverse(); - return getKeyBuffer.toString(); - } - - /** - * Returns the node indexed by key, or null if that node doesn't - * exist. Search begins at root node. - * - *@param key - * A String that indexes the node that is returned. - *@return The node object indexed by key. This object is an instance of an - * inner class named TernarySearchTrie.TSTNode. - */ - public TSTNode getNode(String key) { - return getNode(key, rootNode); - } - - /** - * Returns the node indexed by key, or null if that node doesn't - * exist. The search begins at root node. - * - *@param key2 - * A String that indexes the node that is returned. - *@param startNode - * The top node defining the subtrie to be searched. - *@return The node object indexed by key. This object is an instance of an - * inner class named TernarySearchTrie.TSTNode. 
- */ - protected TSTNode getNode(String key2, TSTNode startNode) { - String key = key2.trim().toLowerCase(); - if (key == null || startNode == null || key.length() == 0) { - return null; - } - TSTNode currentNode = startNode; - int charIndex = 0; - while (true) { - if (currentNode == null) { - return null; - } - int charComp = compareCharsAlphabetically(key.charAt(charIndex), - currentNode.splitchar); - if (charComp == 0) { - charIndex++; - if (charIndex == key.length()) { - return currentNode; - } - currentNode = currentNode.relatives[TSTNode.EQKID]; - } else if (charComp < 0) { - currentNode = currentNode.relatives[TSTNode.LOKID]; - } else { - currentNode = currentNode.relatives[TSTNode.HIKID]; - } - } - } - - /** - * Returns the node indexed by key, creating that node if it doesn't exist, - * and creating any required intermediate nodes if they don't exist. - * - *@param key - * A String that indexes the node that is returned. - *@return The node object indexed by key. This object is an instance of an - * inner class named TernarySearchTrie.TSTNode. - *@exception NullPointerException - * If the key is null. - *@exception IllegalArgumentException - * If the key is an empty String. 
- */ - protected TSTNode getOrCreateNode(String key) throws NullPointerException, - IllegalArgumentException { - if (key == null) { - throw new NullPointerException( - "attempt to get or create node with null key"); - } - if (key.length() == 0) { - throw new IllegalArgumentException( - "attempt to get or create node with key of zero length"); - } - if (rootNode == null) { - rootNode = new TSTNode(key.charAt(0), null); - } - TSTNode currentNode = rootNode; - int charIndex = 0; - while (true) { - int charComp = compareCharsAlphabetically(key.charAt(charIndex), - currentNode.splitchar); - if (charComp == 0) { - charIndex++; - if (charIndex == key.length()) { - return currentNode; - } - if (currentNode.relatives[TSTNode.EQKID] == null) { - currentNode.relatives[TSTNode.EQKID] = new TSTNode(key - .charAt(charIndex), currentNode); - } - currentNode = currentNode.relatives[TSTNode.EQKID]; - } else if (charComp < 0) { - if (currentNode.relatives[TSTNode.LOKID] == null) { - currentNode.relatives[TSTNode.LOKID] = new TSTNode(key - .charAt(charIndex), currentNode); - } - currentNode = currentNode.relatives[TSTNode.LOKID]; - } else { - if (currentNode.relatives[TSTNode.HIKID] == null) { - currentNode.relatives[TSTNode.HIKID] = new TSTNode(key - .charAt(charIndex), currentNode); - } - currentNode = currentNode.relatives[TSTNode.HIKID]; - } - } - } - - /** - * Returns a List of keys that almost match the argument key. - * Keys returned will have exactly diff characters that do not match the - * target key, where diff is equal to the last value passed in as an argument - * to the setMatchAlmostDiff method. - *

- * If the matchAlmost method is called before the - * setMatchAlmostDiff method has been called for the first time, - * then diff = 0. - * - *@param key - * The target key. - *@return A List with the results. - */ - public List matchAlmost(String key) { - return matchAlmost(key, defaultNumReturnValues); - } - - /** - * Returns a List of keys that almost match the argument key. - * Keys returned will have exactly diff characters that do not match the - * target key, where diff is equal to the last value passed in as an argument - * to the setMatchAlmostDiff method. - *

- * If the matchAlmost method is called before the - * setMatchAlmostDiff method has been called for the first time, - * then diff = 0. - * - *@param key - * The target key. - *@param numReturnValues - * The maximum number of values returned by this method. - *@return A List with the results - */ - public List matchAlmost(String key, int numReturnValues) { - return matchAlmostRecursion(rootNode, 0, matchAlmostDiff, key, - ((numReturnValues < 0) ? -1 : numReturnValues), new Vector(), false); - } - - /** - * Recursivelly vists the nodes in order to find the ones that almost match a - * given key. - * - *@param currentNode - * The current node. - *@param charIndex - * The current char. - *@param d - * The number of differences so far. - *@param matchAlmostNumReturnValues - * The maximum number of values in the result List. - *@param matchAlmostResult2 - * The results so far. - *@param upTo - * If true all keys having up to and including matchAlmostDiff - * mismatched letters will be included in the result (including a key - * that is exactly the same as the target string) otherwise keys will - * be included in the result only if they have exactly - * matchAlmostDiff number of mismatched letters. - *@param matchAlmostKey - * The key being searched. - *@return A List with the results. 
- */ - private List matchAlmostRecursion(TSTNode currentNode, int charIndex, - int d, String matchAlmostKey, int matchAlmostNumReturnValues, - List matchAlmostResult2, boolean upTo) { - if ((currentNode == null) - || (matchAlmostNumReturnValues != -1 && matchAlmostResult2.size() >= matchAlmostNumReturnValues) - || (d < 0) || (charIndex >= matchAlmostKey.length())) { - return matchAlmostResult2; - } - int charComp = compareCharsAlphabetically(matchAlmostKey.charAt(charIndex), - currentNode.splitchar); - List matchAlmostResult = matchAlmostResult2; - if ((d > 0) || (charComp < 0)) { - matchAlmostResult = matchAlmostRecursion( - currentNode.relatives[TSTNode.LOKID], charIndex, d, - matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, - upTo); - } - int nextD = (charComp == 0) ? d : d - 1; - boolean cond = (upTo) ? (nextD >= 0) : (nextD == 0); - if ((matchAlmostKey.length() == charIndex + 1) && cond - && (currentNode.data != null)) { - matchAlmostResult.add(getKey(currentNode)); - } - matchAlmostResult = matchAlmostRecursion( - currentNode.relatives[TSTNode.EQKID], charIndex + 1, nextD, - matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo); - if ((d > 0) || (charComp > 0)) { - matchAlmostResult = matchAlmostRecursion( - currentNode.relatives[TSTNode.HIKID], charIndex, d, - matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, - upTo); - } - return matchAlmostResult; - } - - /** - * Returns an alphabetical List of all keys in the trie that - * begin with a given prefix. Only keys for nodes having non-null data are - * included in the List. - * - *@param prefix - * Each key returned from this method will begin with the characters - * in prefix. - *@return A List with the results. - */ - public List matchPrefix(String prefix) { - return matchPrefix(prefix, defaultNumReturnValues); - } - - /** - * Returns an alphabetical List of all keys in the trie that - * begin with a given prefix. 
Only keys for nodes having non-null data are - * included in the List. - * - *@param prefix - * Each key returned from this method will begin with the characters - * in prefix. - *@param numReturnValues - * The maximum number of values returned from this method. - *@return A List with the results - */ - public List matchPrefix(String prefix, int numReturnValues) { - Vector sortKeysResult = new Vector(); - TSTNode startNode = getNode(prefix); - if (startNode == null) { - return sortKeysResult; - } - if (startNode.data != null) { - sortKeysResult.addElement(getKey(startNode)); - } - return sortKeysRecursion(startNode.relatives[TSTNode.EQKID], - ((numReturnValues < 0) ? -1 : numReturnValues), sortKeysResult); - } - - /** - * Returns the number of nodes in the trie that have non-null data. - * - *@return The number of nodes in the trie that have non-null data. - */ - public int numDataNodes() { - return numDataNodes(rootNode); - } - - /** - * Returns the number of nodes in the subtrie below and including the starting - * node. The method counts only nodes that have non-null data. - * - *@param startingNode - * The top node of the subtrie. the node that defines the subtrie. - *@return The total number of nodes in the subtrie. - */ - protected int numDataNodes(TSTNode startingNode) { - return recursiveNodeCalculator(startingNode, true, 0); - } - - /** - * Returns the total number of nodes in the trie. The method counts nodes - * whether or not they have data. - * - *@return The total number of nodes in the trie. - */ - public int numNodes() { - return numNodes(rootNode); - } - - /** - * Returns the total number of nodes in the subtrie below and including the - * starting Node. The method counts nodes whether or not they have data. - * - *@param startingNode - * The top node of the subtrie. The node that defines the subtrie. - *@return The total number of nodes in the subtrie. 
- */ - protected int numNodes(TSTNode startingNode) { - return recursiveNodeCalculator(startingNode, false, 0); - } - - /** - * Stores a value in the trie. The value may be retrieved using the key. - * - *@param key - * A String that indexes the object to be stored. - *@param value - * The object to be stored in the Trie. - */ - public void put(String key, Object value) { - getOrCreateNode(key.trim().toLowerCase()).data = value; - } - - /** - * Recursivelly visists each node to calculate the number of nodes. - * - *@param currentNode - * The current node. - *@param checkData - * If true we check the data to be different of null. - *@param numNodes2 - * The number of nodes so far. - *@return The number of nodes accounted. - */ - private int recursiveNodeCalculator(TSTNode currentNode, boolean checkData, - int numNodes2) { - if (currentNode == null) { - return numNodes2; - } - int numNodes = recursiveNodeCalculator( - currentNode.relatives[TSTNode.LOKID], checkData, numNodes2); - numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.EQKID], - checkData, numNodes); - numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.HIKID], - checkData, numNodes); - if (checkData) { - if (currentNode.data != null) { - numNodes++; - } - } else { - numNodes++; - } - return numNodes; - } - - /** - * Removes the value indexed by key. Also removes all nodes that are rendered - * unnecessary by the removal of this data. - * - *@param key - * A string that indexes the object to be removed from - * the Trie. - */ - public void remove(String key) { - deleteNode(getNode(key.trim().toLowerCase())); - } - - /** - * Sets the number of characters by which words can differ from target word - * when calling the matchAlmost method. - *

- * Arguments less than 0 will set the char difference to 0, and arguments - * greater than 3 will set the char difference to 3. - * - *@param diff - * The number of characters by which words can differ from target - * word. - */ - public void setMatchAlmostDiff(int diff) { - if (diff < 0) { - matchAlmostDiff = 0; - } else if (diff > 3) { - matchAlmostDiff = 3; - } else { - matchAlmostDiff = diff; - } - } - - /** - * Sets the default maximum number of values returned from the - * matchPrefix and matchAlmost methods. - *

- * The value should be set this to -1 to get an unlimited number of return - * values. note that the methods mentioned above provide overloaded versions - * that allow you to specify the maximum number of return values, in which - * case this value is temporarily overridden. - * - **@param num - * The number of values that will be returned when calling the - * methods above. - */ - public void setNumReturnValues(int num) { - defaultNumReturnValues = (num < 0) ? -1 : num; - } - - /** - * Returns keys sorted in alphabetical order. This includes the start Node and - * all nodes connected to the start Node. - *

- * The number of keys returned is limited to numReturnValues. To get a list - * that isn't limited in size, set numReturnValues to -1. - * - *@param startNode - * The top node defining the subtrie to be searched. - *@param numReturnValues - * The maximum number of values returned from this method. - *@return A List with the results. - */ - protected List sortKeys(TSTNode startNode, int numReturnValues) { - return sortKeysRecursion(startNode, ((numReturnValues < 0) ? -1 - : numReturnValues), new Vector()); - } - - /** - * Returns keys sorted in alphabetical order. This includes the current Node - * and all nodes connected to the current Node. - *

- * Sorted keys will be appended to the end of the resulting List. - * The result may be empty when this method is invoked, but may not be - * null. - * - *@param currentNode - * The current node. - *@param sortKeysNumReturnValues - * The maximum number of values in the result. - *@param sortKeysResult2 - * The results so far. - *@return A List with the results. - */ - private List sortKeysRecursion(TSTNode currentNode, - int sortKeysNumReturnValues, List sortKeysResult2) { - if (currentNode == null) { - return sortKeysResult2; - } - List sortKeysResult = sortKeysRecursion( - currentNode.relatives[TSTNode.LOKID], sortKeysNumReturnValues, - sortKeysResult2); - if (sortKeysNumReturnValues != -1 - && sortKeysResult.size() >= sortKeysNumReturnValues) { - return sortKeysResult; - } - if (currentNode.data != null) { - sortKeysResult.add(getKey(currentNode)); - } - sortKeysResult = sortKeysRecursion(currentNode.relatives[TSTNode.EQKID], - sortKeysNumReturnValues, sortKeysResult); - return sortKeysRecursion(currentNode.relatives[TSTNode.HIKID], - sortKeysNumReturnValues, sortKeysResult); - } - -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTAutocomplete.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTAutocomplete.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTAutocomplete.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTAutocomplete.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,142 +0,0 @@ -package org.apache.solr.spelling.suggest.tst; - -import java.util.*; - -public class TSTAutocomplete { - - /** - * Inserting keys in TST in the order middle,small,big (lexicographic measure) - * recursively creates a balanced tree which reduces insertion and search - * times significantly. - * - * @param tokens - * Sorted list of keys to be inserted in TST. 
- * @param lo - * stores the lower index of current list. - * @param hi - * stores the higher index of current list. - * @param root - * a reference object to root of TST. - */ - public void balancedTree(Object[] tokens, Object[] vals, int lo, int hi, - TernaryTreeNode root) { - if (lo > hi) return; - int mid = (lo + hi) / 2; - root = insert(root, (String) tokens[mid], vals[mid], 0); - balancedTree(tokens, vals, lo, mid - 1, root); - balancedTree(tokens, vals, mid + 1, hi, root); - } - - /** - * Inserts a key in TST creating a series of Binary Search Trees at each node. - * The key is actually stored across the eqKid of each node in a successive - * manner. - * - * @param currentNode - * a reference node where the insertion will take currently. - * @param s - * key to be inserted in TST. - * @param x - * index of character in key to be inserted currently. - * @return currentNode The new reference to root node of TST - */ - public TernaryTreeNode insert(TernaryTreeNode currentNode, String s, - Object val, int x) { - if (s == null || s.length() <= x) { - return currentNode; - } - if (currentNode == null) { - TernaryTreeNode newNode = new TernaryTreeNode(); - newNode.splitchar = s.charAt(x); - currentNode = newNode; - if (x < s.length() - 1) { - currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1); - } else { - currentNode.token = s; - currentNode.val = val; - return currentNode; - } - } else if (currentNode.splitchar > s.charAt(x)) { - currentNode.loKid = insert(currentNode.loKid, s, val, x); - } else if (currentNode.splitchar == s.charAt(x)) { - if (x < s.length() - 1) { - currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1); - } else { - currentNode.token = s; - currentNode.val = val; - return currentNode; - } - } else { - currentNode.hiKid = insert(currentNode.hiKid, s, val, x); - } - return currentNode; - } - - /** - * Auto-completes a given prefix query using Depth-First Search with the end - * of prefix as source node each time finding a new 
leaf to get a complete key - * to be added in the suggest list. - * - * @param root - * a reference to root node of TST. - * @param s - * prefix query to be auto-completed. - * @param x - * index of current character to be searched while traversing through - * the prefix in TST. - * @return suggest list of auto-completed keys for the given prefix query. - */ - public ArrayList prefixCompletion(TernaryTreeNode root, - String s, int x) { - - TernaryTreeNode p = root; - ArrayList suggest = new ArrayList(); - - while (p != null) { - if (s.charAt(x) < p.splitchar) { - p = p.loKid; - } else if (s.charAt(x) == p.splitchar) { - if (x == s.length() - 1) { - break; - } else { - x++; - } - p = p.eqKid; - } else { - p = p.hiKid; - } - } - - if (p == null) return suggest; - if (p.eqKid == null && p.token == null) return suggest; - if (p.eqKid == null && p.token != null) { - suggest.add(p); - return suggest; - } - - if (p.token != null) { - suggest.add(p); - } - p = p.eqKid; - - Stack st = new Stack(); - st.push(p); - while (!st.empty()) { - TernaryTreeNode top = st.peek(); - st.pop(); - if (top.token != null) { - suggest.add(top); - } - if (top.eqKid != null) { - st.push(top.eqKid); - } - if (top.loKid != null) { - st.push(top.loKid); - } - if (top.hiKid != null) { - st.push(top.hiKid); - } - } - return suggest; - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookup.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookup.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookup.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookup.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,180 +0,0 @@ -package org.apache.solr.spelling.suggest.tst; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import 
java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.solr.common.util.NamedList; -import org.apache.solr.core.SolrCore; -import org.apache.solr.spelling.suggest.Lookup; -import org.apache.solr.spelling.suggest.SortedTermFreqIteratorWrapper; -import org.apache.solr.util.SortedIterator; -import org.apache.solr.util.TermFreqIterator; - -public class TSTLookup extends Lookup { - TernaryTreeNode root = new TernaryTreeNode(); - TSTAutocomplete autocomplete = new TSTAutocomplete(); - - @Override - public void init(NamedList config, SolrCore core) { - } - - @Override - public void build(TermFreqIterator tfit) throws IOException { - root = new TernaryTreeNode(); - // buffer first - if (!(tfit instanceof SortedIterator)) { - // make sure it's sorted - tfit = new SortedTermFreqIteratorWrapper(tfit); - } - - ArrayList tokens = new ArrayList(); - ArrayList vals = new ArrayList(); - while (tfit.hasNext()) { - tokens.add(tfit.next()); - vals.add(new Float(tfit.freq())); - } - autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); - } - - @Override - public boolean add(String key, Object value) { - autocomplete.insert(root, key, value, 0); - // XXX we don't know if a new node was created - return true; - } - - @Override - public Object get(String key) { - List list = autocomplete.prefixCompletion(root, key, 0); - if (list == null || list.isEmpty()) { - return null; - } - for (TernaryTreeNode n : list) { - if (n.token.equals(key)) { - return n.val; - } - } - return null; - } - - @Override - public List lookup(String key, boolean onlyMorePopular, int num) { - List list = autocomplete.prefixCompletion(root, key, 0); - List res = new ArrayList(); - if (list == null || list.size() == 0) { - return res; - } - int maxCnt = Math.min(num, list.size()); - if (onlyMorePopular) { - LookupPriorityQueue queue = new LookupPriorityQueue(num); - for (TernaryTreeNode ttn : list) { - queue.insertWithOverflow(new 
LookupResult(ttn.token, (Float)ttn.val)); - } - for (LookupResult lr : queue.getResults()) { - res.add(lr); - } - } else { - for (int i = 0; i < maxCnt; i++) { - TernaryTreeNode ttn = list.get(i); - res.add(new LookupResult(ttn.token, (Float)ttn.val)); - } - } - return res; - } - - public static final String FILENAME = "tst.dat"; - - private static final byte LO_KID = 0x01; - private static final byte EQ_KID = 0x02; - private static final byte HI_KID = 0x04; - private static final byte HAS_TOKEN = 0x08; - private static final byte HAS_VALUE = 0x10; - - @Override - public synchronized boolean load(File storeDir) throws IOException { - File data = new File(storeDir, FILENAME); - if (!data.exists() || !data.canRead()) { - return false; - } - DataInputStream in = new DataInputStream(new FileInputStream(data)); - root = new TernaryTreeNode(); - try { - readRecursively(in, root); - } finally { - in.close(); - } - return true; - } - - // pre-order traversal - private void readRecursively(DataInputStream in, TernaryTreeNode node) throws IOException { - node.splitchar = in.readChar(); - byte mask = in.readByte(); - if ((mask & HAS_TOKEN) != 0) { - node.token = in.readUTF(); - } - if ((mask & HAS_VALUE) != 0) { - node.val = new Float(in.readFloat()); - } - if ((mask & LO_KID) != 0) { - node.loKid = new TernaryTreeNode(); - readRecursively(in, node.loKid); - } - if ((mask & EQ_KID) != 0) { - node.eqKid = new TernaryTreeNode(); - readRecursively(in, node.eqKid); - } - if ((mask & HI_KID) != 0) { - node.hiKid = new TernaryTreeNode(); - readRecursively(in, node.hiKid); - } - } - - @Override - public synchronized boolean store(File storeDir) throws IOException { - if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { - return false; - } - File data = new File(storeDir, FILENAME); - DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); - try { - writeRecursively(out, root); - out.flush(); - } finally { - out.close(); - } - return true; - } 
- - // pre-order traversal - private void writeRecursively(DataOutputStream out, TernaryTreeNode node) throws IOException { - // write out the current node - out.writeChar(node.splitchar); - // prepare a mask of kids - byte mask = 0; - if (node.eqKid != null) mask |= EQ_KID; - if (node.loKid != null) mask |= LO_KID; - if (node.hiKid != null) mask |= HI_KID; - if (node.token != null) mask |= HAS_TOKEN; - if (node.val != null) mask |= HAS_VALUE; - out.writeByte(mask); - if (node.token != null) out.writeUTF(node.token); - if (node.val != null) out.writeFloat((Float)node.val); - // recurse and write kids - if (node.loKid != null) { - writeRecursively(out, node.loKid); - } - if (node.eqKid != null) { - writeRecursively(out, node.eqKid); - } - if (node.hiKid != null) { - writeRecursively(out, node.hiKid); - } - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookupFactory.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookupFactory.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookupFactory.java 1969-12-31 19:00:00.000000000 -0500 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookupFactory.java 2011-05-22 18:00:18.000000000 -0400 @@ -0,0 +1,35 @@ +package org.apache.solr.spelling.suggest.tst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.tst.TSTLookup; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.spelling.suggest.LookupFactory; + +/** + * Factory for {@link TSTLookup} + */ +public class TSTLookupFactory extends LookupFactory { + + @Override + public Lookup create(NamedList params, SolrCore core) { + return new TSTLookup(); + } +} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TernaryTreeNode.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TernaryTreeNode.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TernaryTreeNode.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TernaryTreeNode.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,25 +0,0 @@ -package org.apache.solr.spelling.suggest.tst; - -/** - * The class creates a TST node. - */ - -public class TernaryTreeNode { - /** the character stored by a node. */ - char splitchar; - /** a reference object to the node containing character smaller than this node's character. */ - TernaryTreeNode loKid; - /** - * a reference object to the node containing character next to this node's character as - * occurring in the inserted token. - */ - TernaryTreeNode eqKid; - /** a reference object to the node containing character higher than this node's character. 
*/ - TernaryTreeNode hiKid; - /** - * used by leaf nodes to store the complete tokens to be added to suggest list while - * auto-completing the prefix. - */ - String token; - Object val; -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java lucene-trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,133 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.util; - -import java.io.IOException; -import java.util.Iterator; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.search.spell.Dictionary; -import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.BytesRef; - -/** - * HighFrequencyDictionary: terms taken from the given field - * of a Lucene index, which appear in a number of documents - * above a given threshold. - * - * Threshold is a value in [0..1] representing the minimum - * number of documents (of the total) where a term should appear. - * - * Based on LuceneDictionary. - */ -public class HighFrequencyDictionary implements Dictionary { - private IndexReader reader; - private String field; - private float thresh; - - public HighFrequencyDictionary(IndexReader reader, String field, float thresh) { - this.reader = reader; - this.field = StringHelper.intern(field); - this.thresh = thresh; - } - - public final Iterator getWordsIterator() { - return new HighFrequencyIterator(); - } - - final class HighFrequencyIterator implements TermFreqIterator, SortedIterator { - private TermsEnum termsEnum; - private BytesRef actualTerm; - private boolean hasNextCalled; - private int minNumDocs; - - HighFrequencyIterator() { - try { - Terms terms = MultiFields.getTerms(reader, field); - if (terms != null) { - termsEnum = terms.iterator(); - } - minNumDocs = (int)(thresh * (float)reader.numDocs()); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private boolean isFrequent(int freq) { - return freq >= minNumDocs; - } - - public float freq() { - try { - return termsEnum.docFreq(); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - } - - public String next() { - if (!hasNextCalled && !hasNext()) { - return null; - } - hasNextCalled = false; - - return (actualTerm != null) ? 
actualTerm.utf8ToString() : null; - } - - public boolean hasNext() { - if (hasNextCalled) { - return actualTerm != null; - } - hasNextCalled = true; - - if (termsEnum == null) { - return false; - } - - while(true) { - - try { - actualTerm = termsEnum.next(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - // if there are no words return false - if (actualTerm == null) { - return false; - } - - // got a valid term, does it pass the threshold? - try { - if (isFrequent(termsEnum.docFreq())) { - return true; - } - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - } - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/util/SortedIterator.java lucene-trunk/solr/src/java/org/apache/solr/util/SortedIterator.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/util/SortedIterator.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/util/SortedIterator.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,11 +0,0 @@ -package org.apache.solr.util; - -import java.util.Iterator; - -/** - * Marker interface to signal that elements coming from {@link Iterator} - * come in ascending lexicographic order. 
- */ -public interface SortedIterator { - -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/util/TermFreqIterator.java lucene-trunk/solr/src/java/org/apache/solr/util/TermFreqIterator.java --- lucene-clean-trunk/solr/src/java/org/apache/solr/util/TermFreqIterator.java 2011-05-22 12:37:52.000000000 -0400 +++ lucene-trunk/solr/src/java/org/apache/solr/util/TermFreqIterator.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,33 +0,0 @@ -package org.apache.solr.util; - -import java.util.Iterator; - -public interface TermFreqIterator extends Iterator { - - public float freq(); - - public static class TermFreqIteratorWrapper implements TermFreqIterator { - private Iterator wrapped; - - public TermFreqIteratorWrapper(Iterator wrapped) { - this.wrapped = wrapped; - } - - public float freq() { - return 1.0f; - } - - public boolean hasNext() { - return wrapped.hasNext(); - } - - public String next() { - return wrapped.next().toString(); - } - - public void remove() { - throw new UnsupportedOperationException(); - } - - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/Average.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/Average.java --- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/Average.java 2011-05-22 12:37:50.000000000 -0400 +++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/Average.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,52 +0,0 @@ -package org.apache.solr.spelling.suggest; - -import java.util.List; -import java.util.Locale; - -/** - * Average with standard deviation. - */ -final class Average -{ - /** - * Average (in milliseconds). - */ - public final double avg; - - /** - * Standard deviation (in milliseconds). 
- */ - public final double stddev; - - /** - * - */ - Average(double avg, double stddev) - { - this.avg = avg; - this.stddev = stddev; - } - - public String toString() - { - return String.format(Locale.ENGLISH, "%.0f [+- %.2f]", - avg, stddev); - } - - static Average from(List values) - { - double sum = 0; - double sumSquares = 0; - - for (double l : values) - { - sum += l; - sumSquares += l * l; - } - - double avg = sum / (double) values.size(); - return new Average( - (sum / (double) values.size()), - Math.sqrt(sumSquares / (double) values.size() - avg * avg)); - } -} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/LookupBenchmarkTest.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/LookupBenchmarkTest.java --- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/LookupBenchmarkTest.java 2011-05-22 12:37:50.000000000 -0400 +++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/LookupBenchmarkTest.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,230 +0,0 @@ -package org.apache.solr.spelling.suggest; - -import java.net.URL; -import java.util.Collections; -import java.util.List; -import java.util.Locale; -import java.util.Random; -import java.util.concurrent.Callable; - -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.solr.spelling.suggest.fst.FSTLookup; -import org.apache.solr.spelling.suggest.jaspell.JaspellLookup; -import org.apache.solr.spelling.suggest.tst.TSTLookup; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; - -import com.google.common.base.Charsets; -import com.google.common.base.Function; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.io.Resources; - -/** - * Benchmarks tests for implementations of {@link Lookup} interface. - */ -@Ignore // COMMENT ME TO RUN BENCHMARKS! 
-public class LookupBenchmarkTest { - @SuppressWarnings("unchecked") - private final List> benchmarkClasses = Lists.newArrayList( - JaspellLookup.class, - TSTLookup.class, - FSTLookup.class); - - private final static int rounds = 15; - private final static int warmup = 5; - - private final int num = 7; - private final boolean onlyMorePopular = true; - - private final static Random random = new Random(0xdeadbeef); - - /** - * Input term/weight pairs. - */ - private static TermFreq [] dictionaryInput; - - /** - * Benchmark term/weight pairs (randomized order). - */ - private static List benchmarkInput; - - /** - * Loads terms and frequencies from Wikipedia (cached). - */ - @BeforeClass - public static void setup() throws Exception { - List input = readTop50KWiki(); - Collections.shuffle(input, random); - LookupBenchmarkTest.dictionaryInput = input.toArray(new TermFreq [input.size()]); - Collections.shuffle(input, random); - LookupBenchmarkTest.benchmarkInput = input; - } - - /** - * Collect the multilingual input for benchmarks/ tests. - */ - public static List readTop50KWiki() throws Exception { - List input = Lists.newArrayList(); - URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8"); - assert resource != null : "Resource missing: Top50KWiki.utf8"; - - for (String line : Resources.readLines(resource, Charsets.UTF_8)) { - int tab = line.indexOf('|'); - Assert.assertTrue("No | separator?: " + line, tab >= 0); - float weight = Float.parseFloat(line.substring(tab + 1)); - String key = line.substring(0, tab); - input.add(new TermFreq(key, weight)); - } - return input; - } - - /** - * Test construction time. 
- */ - @Test - public void testConstructionTime() throws Exception { - System.err.println("-- construction time"); - for (final Class cls : benchmarkClasses) { - BenchmarkResult result = measure(new Callable() { - public Integer call() throws Exception { - final Lookup lookup = buildLookup(cls, dictionaryInput); - return lookup.hashCode(); - } - }); - - System.err.println( - String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s", - cls.getSimpleName(), - dictionaryInput.length, - result.average.toString())); - } - } - - /** - * Test memory required for the storage. - */ - @Test - public void testStorageNeeds() throws Exception { - System.err.println("-- RAM consumption"); - final RamUsageEstimator rue = new RamUsageEstimator(); - for (Class cls : benchmarkClasses) { - Lookup lookup = buildLookup(cls, dictionaryInput); - System.err.println( - String.format(Locale.ENGLISH, "%-15s size[B]:%,13d", - lookup.getClass().getSimpleName(), - rue.estimateRamUsage(lookup))); - } - } - - /** - * Create {@link Lookup} instance and populate it. - */ - private Lookup buildLookup(Class cls, TermFreq[] input) throws Exception { - Lookup lookup = cls.newInstance(); - lookup.build(new TermFreqArrayIterator(input)); - return lookup; - } - - /** - * Test performance of lookup on full hits. - */ - @Test - public void testPerformanceOnFullHits() throws Exception { - final int minPrefixLen = 100; - final int maxPrefixLen = 200; - runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular); - } - - /** - * Test performance of lookup on longer term prefixes (6-9 letters or shorter). - */ - @Test - public void testPerformanceOnPrefixes6_9() throws Exception { - final int minPrefixLen = 6; - final int maxPrefixLen = 9; - runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular); - } - - /** - * Test performance of lookup on short term prefixes (2-4 letters or shorter). 
- */ - @Test - public void testPerformanceOnPrefixes2_4() throws Exception { - final int minPrefixLen = 2; - final int maxPrefixLen = 4; - runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular); - } - - /** - * Run the actual benchmark. - */ - public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen, - final int num, final boolean onlyMorePopular) throws Exception { - System.err.println(String.format(Locale.ENGLISH, - "-- prefixes: %d-%d, num: %d, onlyMorePopular: %s", - minPrefixLen, maxPrefixLen, num, onlyMorePopular)); - - for (Class cls : benchmarkClasses) { - final Lookup lookup = buildLookup(cls, dictionaryInput); - - final List input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function() { - public String apply(TermFreq tf) { - return tf.term.substring(0, Math.min(tf.term.length(), - minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))); - } - })); - - BenchmarkResult result = measure(new Callable() { - public Integer call() throws Exception { - int v = 0; - for (String term : input) { - v += lookup.lookup(term, onlyMorePopular, num).size(); - } - return v; - } - }); - - System.err.println( - String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f", - lookup.getClass().getSimpleName(), - input.size(), - result.average.toString(), - input.size() / result.average.avg)); - } - } - - /** - * Do the measurements. - */ - private BenchmarkResult measure(Callable callable) { - final double NANOS_PER_MS = 1000000; - - try { - List times = Lists.newArrayList(); - for (int i = 0; i < warmup + rounds; i++) { - final long start = System.nanoTime(); - guard = callable.call().intValue(); - times.add((System.nanoTime() - start) / NANOS_PER_MS); - } - return new BenchmarkResult(times, warmup, rounds); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - /** Guard against opts. 
*/ - @SuppressWarnings("unused") - private static volatile int guard; - - private static class BenchmarkResult { - /** Average time per round (ms). */ - public final Average average; - - public BenchmarkResult(List times, int warmup, int rounds) { - this.average = Average.from(times.subList(warmup, times.size())); - } - } -} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/PersistenceTest.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/PersistenceTest.java --- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/PersistenceTest.java 2011-05-22 12:37:50.000000000 -0400 +++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/PersistenceTest.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,92 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.solr.spelling.suggest; - -import java.io.File; - -import org.apache.solr.SolrTestCaseJ4; -import org.apache.solr.spelling.suggest.fst.FSTLookup; -import org.apache.solr.spelling.suggest.jaspell.JaspellLookup; -import org.apache.solr.spelling.suggest.tst.TSTLookup; -import org.junit.Test; - -public class PersistenceTest extends SolrTestCaseJ4 { - public final String[] keys = new String[] { - "one", - "two", - "three", - "four", - "oneness", - "onerous", - "onesimus", - "twofold", - "twonk", - "thrive", - "through", - "threat", - "foundation", - "fourier", - "fourty"}; - - @Test - public void testTSTPersistence() throws Exception { - runTest(TSTLookup.class, true); - } - - @Test - public void testJaspellPersistence() throws Exception { - runTest(JaspellLookup.class, true); - } - - @Test - public void testFSTPersistence() throws Exception { - runTest(FSTLookup.class, false); - } - - private void runTest(Class lookupClass, - boolean supportsExactWeights) throws Exception { - - // Add all input keys. - Lookup lookup = lookupClass.newInstance(); - TermFreq[] keys = new TermFreq[this.keys.length]; - for (int i = 0; i < keys.length; i++) - keys[i] = new TermFreq(this.keys[i], (float) i); - lookup.build(new TermFreqArrayIterator(keys)); - - // Store the suggester. - File storeDir = new File(TEST_HOME()); - lookup.store(storeDir); - - // Re-read it from disk. - lookup = lookupClass.newInstance(); - lookup.load(storeDir); - - // Assert validity. 
- float previous = Float.NEGATIVE_INFINITY; - for (TermFreq k : keys) { - Float val = (Float) lookup.get(k.term); - assertNotNull(k.term, val); - - if (supportsExactWeights) { - assertEquals(k.term, Float.valueOf(k.v), val); - } else { - assertTrue(val + ">=" + previous, val >= previous); - previous = val.floatValue(); - } - } - } -} diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreq.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreq.java --- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreq.java 2011-05-22 12:37:50.000000000 -0400 +++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreq.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,11 +0,0 @@ -package org.apache.solr.spelling.suggest; - -public final class TermFreq { - public final String term; - public final float v; - - public TermFreq(String term, float v) { - this.term = term; - this.v = v; - } -} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreqArrayIterator.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreqArrayIterator.java --- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreqArrayIterator.java 2011-05-22 12:37:50.000000000 -0400 +++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreqArrayIterator.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,40 +0,0 @@ -package org.apache.solr.spelling.suggest; - -import java.util.Arrays; -import java.util.Iterator; - -import org.apache.solr.util.TermFreqIterator; - -/** - * A {@link TermFreqIterator} over a sequence of {@link TermFreq}s. 
- */ -public final class TermFreqArrayIterator implements TermFreqIterator { - private final Iterator i; - private TermFreq current; - - public TermFreqArrayIterator(Iterator i) { - this.i = i; - } - - public TermFreqArrayIterator(TermFreq [] i) { - this(Arrays.asList(i)); - } - - public TermFreqArrayIterator(Iterable i) { - this(i.iterator()); - } - - public float freq() { - return current.v; - } - - public boolean hasNext() { - return i.hasNext(); - } - - public String next() { - return (current = i.next()).term; - } - - public void remove() { throw new UnsupportedOperationException(); } -} \ No newline at end of file diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/fst/FSTLookupTest.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/fst/FSTLookupTest.java --- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/fst/FSTLookupTest.java 2011-05-22 12:37:50.000000000 -0400 +++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/fst/FSTLookupTest.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,155 +0,0 @@ -package org.apache.solr.spelling.suggest.fst; - -import java.util.Arrays; -import java.util.List; -import java.util.Locale; -import java.util.Random; - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.solr.spelling.suggest.Lookup.LookupResult; -import org.apache.solr.spelling.suggest.LookupBenchmarkTest; -import org.apache.solr.spelling.suggest.TermFreq; -import org.apache.solr.spelling.suggest.TermFreqArrayIterator; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import com.google.common.collect.Lists; - -/** - * Unit tests for {@link FSTLookup}. 
- */ -public class FSTLookupTest extends LuceneTestCase { - public static TermFreq tf(String t, float v) { - return new TermFreq(t, v); - } - - private FSTLookup lookup; - - @Before - public void prepare() throws Exception { - final TermFreq[] keys = new TermFreq[] { - tf("one", 0.5f), - tf("oneness", 1), - tf("onerous", 1), - tf("onesimus", 1), - tf("two", 1), - tf("twofold", 1), - tf("twonk", 1), - tf("thrive", 1), - tf("through", 1), - tf("threat", 1), - tf("three", 1), - tf("foundation", 1), - tf("fourier", 1), - tf("four", 1), - tf("fourty", 1), - tf("xo", 1), - }; - - lookup = new FSTLookup(); - lookup.build(new TermFreqArrayIterator(keys)); - } - - @Test - public void testExactMatchHighPriority() throws Exception { - assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0"); - } - - @Test - public void testExactMatchLowPriority() throws Exception { - assertMatchEquals(lookup.lookup("one", true, 2), - "one/0.0", - "oneness/1.0"); - } - - @Test - public void testMiss() throws Exception { - assertMatchEquals(lookup.lookup("xyz", true, 1)); - } - - @Test - public void testAlphabeticWithWeights() throws Exception { - assertEquals(0, lookup.lookup("xyz", false, 1).size()); - } - - @Test - public void testFullMatchList() throws Exception { - assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE), - "oneness/1.0", - "onerous/1.0", - "onesimus/1.0", - "one/0.0"); - } - - @Test - public void testMultilingualInput() throws Exception { - List input = LookupBenchmarkTest.readTop50KWiki(); - - lookup = new FSTLookup(); - lookup.build(new TermFreqArrayIterator(input)); - - for (TermFreq tf : input) { - assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null); - assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key); - } - } - - @Test - public void testEmptyInput() throws Exception { - lookup = new FSTLookup(); - lookup.build(new TermFreqArrayIterator(new TermFreq[0])); - - assertMatchEquals(lookup.lookup("", true, 10)); - } - - @Test - public 
void testRandom() throws Exception { - List freqs = Lists.newArrayList(); - Random rnd = random; - for (int i = 0; i < 5000; i++) { - freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100))); - } - lookup = new FSTLookup(); - lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()]))); - - for (TermFreq tf : freqs) { - final String term = tf.term; - for (int i = 1; i < term.length(); i++) { - String prefix = term.substring(0, i); - for (LookupResult lr : lookup.lookup(prefix, true, 10)) { - Assert.assertTrue(lr.key.startsWith(prefix)); - } - } - } - } - - private void assertMatchEquals(List res, String... expected) { - String [] result = new String [res.size()]; - for (int i = 0; i < res.size(); i++) - result[i] = res.get(i).toString(); - - if (!Arrays.equals(expected, result)) { - int colLen = Math.max(maxLen(expected), maxLen(result)); - - StringBuilder b = new StringBuilder(); - String format = "%" + colLen + "s " + "%" + colLen + "s\n"; - b.append(String.format(Locale.ENGLISH, format, "Expected", "Result")); - for (int i = 0; i < Math.max(result.length, expected.length); i++) { - b.append(String.format(Locale.ENGLISH, format, - i < expected.length ? expected[i] : "--", - i < result.length ? result[i] : "--")); - } - - System.err.println(b.toString()); - fail("Expected different output:\n" + b.toString()); - } - } - - private int maxLen(String[] result) { - int len = 0; - for (String s : result) - len = Math.max(len, s.length()); - return len; - } -}