diff -ruN -x .svn -x build lucene-clean-trunk/lucene/build.xml lucene-trunk/lucene/build.xml
--- lucene-clean-trunk/lucene/build.xml 2011-05-22 12:38:26.000000000 -0400
+++ lucene-trunk/lucene/build.xml 2011-05-22 18:53:18.000000000 -0400
@@ -231,7 +231,6 @@
-
@@ -256,7 +255,6 @@
-
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/build.xml lucene-trunk/lucene/contrib/spellchecker/build.xml
--- lucene-clean-trunk/lucene/contrib/spellchecker/build.xml 2011-05-22 12:38:17.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/build.xml 1969-12-31 19:00:00.000000000 -0500
@@ -1,43 +0,0 @@
-
-
-
-
-
-
-
- Spell Checker
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,35 +0,0 @@
-package org.apache.lucene.search.spell;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Iterator;
-
-/**
- * A simple interface representing a Dictionary. A Dictionary
- * here is just a list of words.
- *
- *
- * @version 1.0
- */
-public interface Dictionary {
-
- /**
- * Return all words present in the dictionary
- * @return Iterator
- */
- Iterator getWordsIterator();
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,487 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.PriorityQueue;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.FuzzyTermsEnum;
-import org.apache.lucene.search.BoostAttribute;
-import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.LevenshteinAutomata;
-
-/**
- * Simple automaton-based spellchecker.
- *
- * Candidates are presented directly from the term dictionary, based on
- * Levenshtein distance. This is an alternative to {@link SpellChecker}
- * if you are using an edit-distance-like metric such as Levenshtein
- * or {@link JaroWinklerDistance}.
- *
- * A practical benefit of this spellchecker is that it requires no additional
- * datastructures (neither in RAM nor on disk) to do its work.
- *
- * @see LevenshteinAutomata
- * @see FuzzyTermsEnum
- *
- * @lucene.experimental
- */
-public class DirectSpellChecker {
- /** The default StringDistance, Levenshtein distance implemented internally
- * via {@link LevenshteinAutomata}.
- *
- * Note: this is the fastest distance metric, because Levenshtein is used
- * to draw candidates from the term dictionary: this just re-uses the scoring.
- *
- * Note also that this metric differs in subtle ways from {@link LevensteinDistance}:
- *
- *
- *   <li> This metric treats full unicode codepoints as characters, but
- * LevenshteinDistance calculates based on UTF-16 code units.
- *
- *   <li> This metric scales raw edit distances into a floating point score
- * differently than LevenshteinDistance: the scaling is based upon the
- * shortest of the two terms instead of the longest.
- *
- */
- public static final StringDistance INTERNAL_LEVENSHTEIN = new StringDistance() {
- public float getDistance(String s1, String s2) {
- throw new UnsupportedOperationException("Not for external use.");
- }};
-
- /** maximum edit distance for candidate terms */
- private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
- /** minimum prefix for candidate terms */
- private int minPrefix = 1;
- /** maximum number of top-N inspections per suggestion */
- private int maxInspections = 5;
- /** minimum accuracy for a term to match */
- private float accuracy = SpellChecker.DEFAULT_ACCURACY;
- /** value in [0..1] (or absolute number >=1) representing the minimum
- * number of documents (of the total) where a term should appear. */
- private float thresholdFrequency = 0f;
- /** minimum length of a query word to return suggestions */
- private int minQueryLength = 4;
- /** value in [0..1] (or absolute number >=1) representing the maximum
- * number of documents (of the total) a query term can appear in to
- * be corrected. */
- private float maxQueryFrequency = 0.01f;
- /** true if the spellchecker should lowercase terms */
- private boolean lowerCaseTerms = true;
- /** the comparator to use */
- private Comparator comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
- /** the string distance to use */
- private StringDistance distance = INTERNAL_LEVENSHTEIN;
-
- /** Get the maximum number of Levenshtein edit-distances to draw
- * candidate terms from. */
- public int getMaxEdits() {
- return maxEdits;
- }
-
- /** Sets the maximum number of Levenshtein edit-distances to draw
- * candidate terms from. This value can be 1 or 2. The default is 2.
- *
- * Note: a large number of spelling errors occur with an edit distance
- * of 1, by setting this value to 1 you can increase both performance
- * and precision at the cost of recall.
- */
- public void setMaxEdits(int maxEdits) {
- if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
- throw new UnsupportedOperationException("Invalid maxEdits");
- this.maxEdits = maxEdits;
- }
-
- /**
- * Get the minimal number of characters that must match exactly
- */
- public int getMinPrefix() {
- return minPrefix;
- }
-
- /**
- * Sets the minimal number of initial characters (default: 1)
- * that must match exactly.
- *
- * This can improve both performance and accuracy of results,
- * as misspellings are commonly not the first character.
- */
- public void setMinPrefix(int minPrefix) {
- this.minPrefix = minPrefix;
- }
-
- /**
- * Get the maximum number of top-N inspections per suggestion
- */
- public int getMaxInspections() {
- return maxInspections;
- }
-
- /**
- * Set the maximum number of top-N inspections (default: 5) per suggestion.
- *
- * Increasing this number can improve the accuracy of results, at the cost
- * of performance.
- */
- public void setMaxInspections(int maxInspections) {
- this.maxInspections = maxInspections;
- }
-
- /**
- * Get the minimal accuracy from the StringDistance for a match
- */
- public float getAccuracy() {
- return accuracy;
- }
-
- /**
- * Set the minimal accuracy required (default: 0.5f) from a StringDistance
- * for a suggestion match.
- */
- public void setAccuracy(float accuracy) {
- this.accuracy = accuracy;
- }
-
- /**
- * Get the minimal threshold of documents a term must appear for a match
- */
- public float getThresholdFrequency() {
- return thresholdFrequency;
- }
-
- /**
- * Set the minimal threshold of documents a term must appear for a match.
- *
- * This can improve quality by only suggesting high-frequency terms. Note that
- * very high values might decrease performance slightly, by forcing the spellchecker
- * to draw more candidates from the term dictionary, but a practical value such
- * as 1 can be very useful towards improving quality.
- *
- * This can be specified as a relative percentage of documents such as 0.5f,
- * or it can be specified as an absolute whole document frequency, such as 4f.
- * Absolute document frequencies may not be fractional.
- */
- public void setThresholdFrequency(float thresholdFrequency) {
- if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency)
- throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
- this.thresholdFrequency = thresholdFrequency;
- }
-
- /** Get the minimum length of a query term needed to return suggestions */
- public int getMinQueryLength() {
- return minQueryLength;
- }
-
- /**
- * Set the minimum length of a query term (default: 4) needed to return suggestions.
- *
- * Very short query terms will often cause only bad suggestions with any distance
- * metric.
- */
- public void setMinQueryLength(int minQueryLength) {
- this.minQueryLength = minQueryLength;
- }
-
- /**
- * Get the maximum threshold of documents a query term can appear in order
- * to provide suggestions.
- */
- public float getMaxQueryFrequency() {
- return maxQueryFrequency;
- }
-
- /**
- * Set the maximum threshold (default: 0.01f) of documents a query term can
- * appear in order to provide suggestions.
- *
- * Very high-frequency terms are typically spelled correctly. Additionally,
- * this can increase performance as it will do no work for the common case
- * of correctly-spelled input terms.
- *
- * This can be specified as a relative percentage of documents such as 0.5f,
- * or it can be specified as an absolute whole document frequency, such as 4f.
- * Absolute document frequencies may not be fractional.
- */
- public void setMaxQueryFrequency(float maxQueryFrequency) {
- if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency)
- throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
- this.maxQueryFrequency = maxQueryFrequency;
- }
-
- /** true if the spellchecker should lowercase terms */
- public boolean getLowerCaseTerms() {
- return lowerCaseTerms;
- }
-
- /**
- * True if the spellchecker should lowercase terms (default: true)
- *
- * This is a convenience method, if your index field has more complicated
- * analysis (such as StandardTokenizer removing punctuation), its probably
- * better to turn this off, and instead run your query terms through your
- * Analyzer first.
- *
- * If this option is not on, case differences count as an edit!
- */
- public void setLowerCaseTerms(boolean lowerCaseTerms) {
- this.lowerCaseTerms = lowerCaseTerms;
- }
-
- /**
- * Get the current comparator in use.
- */
- public Comparator getComparator() {
- return comparator;
- }
-
- /**
- * Set the comparator for sorting suggestions.
- * The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR}
- */
- public void setComparator(Comparator comparator) {
- this.comparator = comparator;
- }
-
- /**
- * Get the string distance metric in use.
- */
- public StringDistance getDistance() {
- return distance;
- }
-
- /**
- * Set the string distance metric.
- * The default is {@link #INTERNAL_LEVENSHTEIN}
- *
- * Note: because this spellchecker draws its candidates from the
- * term dictionary using Levenshtein, it works best with an edit-distance-like
- * string metric. If you use a different metric than the default,
- * you might want to consider increasing {@link #setMaxInspections(int)}
- * to draw more candidates for your metric to rank.
- */
- public void setDistance(StringDistance distance) {
- this.distance = distance;
- }
-
- /**
- * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean)
- * suggestSimilar(term, numSug, ir, false)}
- */
- public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir)
- throws IOException {
- return suggestSimilar(term, numSug, ir, false);
- }
-
- /**
- * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean, float)
- * suggestSimilar(term, numSug, ir, morePopular, this.accuracy)}
- */
- public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
- boolean morePopular) throws IOException {
- return suggestSimilar(term, numSug, ir, morePopular, accuracy);
- }
-
- /**
- * Suggest similar words.
- *
- *
- * <p>Unlike {@link SpellChecker}, the similarity used to fetch the most
- * relevant terms is an edit distance, therefore typically a low value
- * for numSug will work very well.
- *
- * @param term Term you want to spell check on
- * @param numSug the maximum number of suggested words
- * @param ir IndexReader to find terms from
- * @param morePopular return only suggested words that are as frequent or more frequent than the searched word
- * @param accuracy return only suggested words that match with this similarity
- * @return sorted list of the suggested words according to the comparator
- * @throws IOException
- */
- public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
- boolean morePopular, float accuracy) throws IOException {
-
- String text = term.text();
- if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
- return new SuggestWord[0];
-
- if (lowerCaseTerms)
- term = term.createTerm(text.toLowerCase(Locale.ENGLISH));
-
- int docfreq = ir.docFreq(term);
-
- // see line 341 of spellchecker. this is certainly very very nice for perf,
- // but is it really the right way to go?
- if (!morePopular && docfreq > 0) {
- return new SuggestWord[0];
- }
-
- int maxDoc = ir.maxDoc();
-
- if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
- return new SuggestWord[0];
- } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) {
- return new SuggestWord[0];
- }
-
- if (!morePopular) docfreq = 0;
-
- if (thresholdFrequency >= 1f) {
- docfreq = Math.max(docfreq, (int) thresholdFrequency);
- } else if (thresholdFrequency > 0f) {
- docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1);
- }
-
- Collection terms = null;
- int inspections = numSug * maxInspections;
-
- // try ed=1 first, in case we get lucky
- terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy);
- if (maxEdits > 1 && terms.size() < inspections) {
- HashSet moreTerms = new HashSet();
- moreTerms.addAll(terms);
- moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy));
- terms = moreTerms;
- }
-
- // create the suggestword response, sort it, and trim it to size.
-
- SuggestWord suggestions[] = new SuggestWord[terms.size()];
- int index = suggestions.length - 1;
- for (ScoreTerm s : terms) {
- SuggestWord suggestion = new SuggestWord();
- suggestion.string = s.termAsString != null ? s.termAsString : s.term.utf8ToString();
- suggestion.score = s.score;
- suggestion.freq = s.docfreq;
- suggestions[index--] = suggestion;
- }
-
- ArrayUtil.mergeSort(suggestions, Collections.reverseOrder(comparator));
- if (numSug < suggestions.length) {
- SuggestWord trimmed[] = new SuggestWord[numSug];
- System.arraycopy(suggestions, 0, trimmed, 0, numSug);
- suggestions = trimmed;
- }
- return suggestions;
- }
-
- private Collection suggestSimilar(Term term, int numSug,
- IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException {
-
- AttributeSource atts = new AttributeSource();
- MaxNonCompetitiveBoostAttribute maxBoostAtt =
- atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
- FuzzyTermsEnum e = new FuzzyTermsEnum(MultiFields.getTerms(ir, term.field()).iterator(), atts, term, editDistance, Math.max(minPrefix, editDistance-1));
- final PriorityQueue stQueue = new PriorityQueue();
-
- BytesRef queryTerm = new BytesRef(term.text());
- BytesRef candidateTerm;
- ScoreTerm st = new ScoreTerm();
- BoostAttribute boostAtt =
- e.attributes().addAttribute(BoostAttribute.class);
- while ((candidateTerm = e.next()) != null) {
- final float boost = boostAtt.getBoost();
- // ignore uncompetitive hits
- if (stQueue.size() >= numSug && boost <= stQueue.peek().boost)
- continue;
-
- // ignore exact match of the same term
- if (queryTerm.bytesEquals(candidateTerm))
- continue;
-
- int df = e.docFreq();
-
- // check docFreq if required
- if (df <= docfreq)
- continue;
-
- final float score;
- final String termAsString;
- if (distance == INTERNAL_LEVENSHTEIN) {
- // delay creating strings until the end
- termAsString = null;
- // undo FuzzyTermsEnum's scale factor for a real scaled lev score
- score = boost / e.getScaleFactor() + e.getMinSimilarity();
- } else {
- termAsString = candidateTerm.utf8ToString();
- score = distance.getDistance(term.text(), termAsString);
- }
-
- if (score < accuracy)
- continue;
-
- // add new entry in PQ
- st.term = new BytesRef(candidateTerm);
- st.boost = boost;
- st.docfreq = df;
- st.termAsString = termAsString;
- st.score = score;
- stQueue.offer(st);
- // possibly drop entries from queue
- st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
- maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
- }
-
- return stQueue;
- }
-
- private static class ScoreTerm implements Comparable {
- public BytesRef term;
- public float boost;
- public int docfreq;
-
- public String termAsString;
- public float score;
-
- public int compareTo(ScoreTerm other) {
- if (term.bytesEquals(other.term))
- return 0; // consistent with equals
- if (this.boost == other.boost)
- return other.term.compareTo(this.term);
- else
- return Float.compare(this.boost, other.boost);
- }
-
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = 1;
- result = prime * result + ((term == null) ? 0 : term.hashCode());
- return result;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj) return true;
- if (obj == null) return false;
- if (getClass() != obj.getClass()) return false;
- ScoreTerm other = (ScoreTerm) obj;
- if (term == null) {
- if (other.term != null) return false;
- } else if (!term.bytesEquals(other.term)) return false;
- return true;
- }
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,112 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Arrays;
-
-public class JaroWinklerDistance implements StringDistance {
-
- private float threshold = 0.7f;
-
- private int[] matches(String s1, String s2) {
- String max, min;
- if (s1.length() > s2.length()) {
- max = s1;
- min = s2;
- } else {
- max = s2;
- min = s1;
- }
- int range = Math.max(max.length() / 2 - 1, 0);
- int[] matchIndexes = new int[min.length()];
- Arrays.fill(matchIndexes, -1);
- boolean[] matchFlags = new boolean[max.length()];
- int matches = 0;
- for (int mi = 0; mi < min.length(); mi++) {
- char c1 = min.charAt(mi);
- for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max
- .length()); xi < xn; xi++) {
- if (!matchFlags[xi] && c1 == max.charAt(xi)) {
- matchIndexes[mi] = xi;
- matchFlags[xi] = true;
- matches++;
- break;
- }
- }
- }
- char[] ms1 = new char[matches];
- char[] ms2 = new char[matches];
- for (int i = 0, si = 0; i < min.length(); i++) {
- if (matchIndexes[i] != -1) {
- ms1[si] = min.charAt(i);
- si++;
- }
- }
- for (int i = 0, si = 0; i < max.length(); i++) {
- if (matchFlags[i]) {
- ms2[si] = max.charAt(i);
- si++;
- }
- }
- int transpositions = 0;
- for (int mi = 0; mi < ms1.length; mi++) {
- if (ms1[mi] != ms2[mi]) {
- transpositions++;
- }
- }
- int prefix = 0;
- for (int mi = 0; mi < min.length(); mi++) {
- if (s1.charAt(mi) == s2.charAt(mi)) {
- prefix++;
- } else {
- break;
- }
- }
- return new int[] { matches, transpositions / 2, prefix, max.length() };
- }
-
- public float getDistance(String s1, String s2) {
- int[] mtp = matches(s1, s2);
- float m = mtp[0];
- if (m == 0) {
- return 0f;
- }
- float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
- float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
- * (1 - j);
- return jw;
- }
-
- /**
- * Sets the threshold used to determine when Winkler bonus should be used.
- * Set to a negative value to get the Jaro distance.
- * @param threshold the new value of the threshold
- */
- public void setThreshold(float threshold) {
- this.threshold = threshold;
- }
-
- /**
- * Returns the current value of the threshold used for adding the Winkler bonus.
- * The default value is 0.7.
- * @return the current value of the threshold
- */
- public float getThreshold() {
- return threshold;
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,109 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Levenstein edit distance class.
- */
-public final class LevensteinDistance implements StringDistance {
-
- /**
- * Optimized to run a bit faster than the static getDistance().
- * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
- */
- public LevensteinDistance () {
- }
-
-
- //*****************************
- // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
- //*****************************
- public float getDistance (String target, String other) {
- char[] sa;
- int n;
- int p[]; //'previous' cost array, horizontally
- int d[]; // cost array, horizontally
- int _d[]; //placeholder to assist in swapping p and d
-
- /*
- The difference between this impl. and the previous is that, rather
- than creating and retaining a matrix of size s.length()+1 by t.length()+1,
- we maintain two single-dimensional arrays of length s.length()+1. The first, d,
- is the 'current working' distance array that maintains the newest distance cost
- counts as we iterate through the characters of String s. Each time we increment
- the index of String t we are comparing, d is copied to p, the second int[]. Doing so
- allows us to retain the previous cost counts as required by the algorithm (taking
- the minimum of the cost count to the left, up one, and diagonally up and to the left
- of the current cost count being calculated). (Note that the arrays aren't really
- copied anymore, just switched...this is clearly much better than cloning an array
- or doing a System.arraycopy() each time through the outer loop.)
-
- Effectively, the difference between the two implementations is this one does not
- cause an out of memory condition when calculating the LD over two very large strings.
- */
-
- sa = target.toCharArray();
- n = sa.length;
- p = new int[n+1];
- d = new int[n+1];
-
- final int m = other.length();
- if (n == 0 || m == 0) {
- if (n == m) {
- return 1;
- }
- else {
- return 0;
- }
- }
-
-
- // indexes into strings s and t
- int i; // iterates through s
- int j; // iterates through t
-
- char t_j; // jth character of t
-
- int cost; // cost
-
- for (i = 0; i<=n; i++) {
- p[i] = i;
- }
-
- for (j = 1; j<=m; j++) {
- t_j = other.charAt(j-1);
- d[0] = j;
-
- for (i=1; i<=n; i++) {
- cost = sa[i-1]==t_j ? 0 : 1;
- // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
- d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
- }
-
- // copy current distance counts to 'previous row' distance counts
- _d = p;
- p = d;
- d = _d;
- }
-
- // our last action in the above loop was to switch d and p, so p now
- // actually has the most recent cost counts
- return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length));
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,96 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.index.IndexReader;
-
-import java.util.Iterator;
-
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.util.StringHelper;
-
-import java.io.*;
-
-/**
- * Lucene Dictionary: terms taken from the given field
- * of a Lucene index.
- *
- * When using IndexReader.terms(Term) the code must not call next() on TermEnum
- * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
- *
- *
- *
- */
-public class LuceneDictionary implements Dictionary {
- private IndexReader reader;
- private String field;
-
- public LuceneDictionary(IndexReader reader, String field) {
- this.reader = reader;
- this.field = StringHelper.intern(field);
- }
-
- public final Iterator getWordsIterator() {
- return new LuceneIterator();
- }
-
-
- final class LuceneIterator implements Iterator {
- private TermsEnum termsEnum;
- private BytesRef pendingTerm;
-
- LuceneIterator() {
- try {
- final Terms terms = MultiFields.getTerms(reader, field);
- if (terms != null) {
- termsEnum = terms.iterator();
- pendingTerm = termsEnum.next();
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- public String next() {
- if (pendingTerm == null) {
- return null;
- }
-
- String result = pendingTerm.utf8ToString();
-
- try {
- pendingTerm = termsEnum.next();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- return result;
- }
-
- public boolean hasNext() {
- return pendingTerm != null;
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,144 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements. See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/**
- * N-Gram version of edit distance based on paper by Grzegorz Kondrak,
- * "N-gram similarity and distance". Proceedings of the Twelfth International
- * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126,
- * Buenos Aires, Argentina, November 2005.
- * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
- *
- * This implementation uses the position-based optimization to compute partial
- * matches of n-gram sub-strings and adds a null-character prefix of size n-1
- * so that the first character is contained in the same number of n-grams as
- * a middle character. Null-character prefix matches are discounted so that
- * strings with no matching characters will return a distance of 0.
- *
- */
-public class NGramDistance implements StringDistance {
-
- private int n;
-
- /**
- * Creates an N-Gram distance measure using n-grams of the specified size.
- * @param size The size of the n-gram to be used to compute the string distance.
- */
- public NGramDistance(int size) {
- this.n = size;
- }
-
- /**
- * Creates an N-Gram distance measure using n-grams of size 2.
- */
- public NGramDistance() {
- this(2);
- }
-
- public float getDistance(String source, String target) {
- final int sl = source.length();
- final int tl = target.length();
-
- if (sl == 0 || tl == 0) {
- if (sl == tl) {
- return 1;
- }
- else {
- return 0;
- }
- }
-
- int cost = 0;
- if (sl < n || tl < n) {
- for (int i=0,ni=Math.min(sl,tl);iFormat allowed: 1 word per line:
- * word1
- * word2
- * word3
- */
-public class PlainTextDictionary implements Dictionary {
-
- private BufferedReader in;
- private String line;
- private boolean hasNextCalled;
-
- public PlainTextDictionary(File file) throws FileNotFoundException {
- in = new BufferedReader(new FileReader(file));
- }
-
- public PlainTextDictionary(InputStream dictFile) {
- in = new BufferedReader(new InputStreamReader(dictFile));
- }
-
- /**
- * Creates a dictionary based on a reader.
- */
- public PlainTextDictionary(Reader reader) {
- in = new BufferedReader(reader);
- }
-
- public Iterator getWordsIterator() {
- return new fileIterator();
- }
-
- final class fileIterator implements Iterator {
- public String next() {
- if (!hasNextCalled) {
- hasNext();
- }
- hasNextCalled = false;
- return line;
- }
-
- public boolean hasNext() {
- hasNextCalled = true;
- try {
- line = in.readLine();
- } catch (IOException ex) {
- throw new RuntimeException(ex);
- }
- return (line != null) ? true : false;
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,724 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.TieredMergePolicy;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.store.AlreadyClosedException;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.ReaderUtil;
-import org.apache.lucene.util.Version;
-
-/**
- *
- * Spell Checker class (Main class)
- * (initially inspired by the David Spencer code).
- *
- *
- *
Example Usage:
- *
- *
- * SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
- * // To index a field of a user index:
- * spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
- * // To index a file containing words:
- * spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
- * String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
- *
- *
- *
- * @version 1.0
- */
-public class SpellChecker implements java.io.Closeable {
-
- /**
- * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} .
- */
- public static final float DEFAULT_ACCURACY = 0.5f;
-
- /**
- * Field name for each word in the ngram index.
- */
- public static final String F_WORD = "word";
-
- private static final Term F_WORD_TERM = new Term(F_WORD);
-
- /**
- * the spell index
- */
- // don't modify the directory directly - see #swapSearcher()
- // TODO: why is this package private?
- Directory spellIndex;
- /**
- * Boost value for start and end grams
- */
- private float bStart = 2.0f;
-
- private float bEnd = 1.0f;
- // don't use this searcher directly - see #swapSearcher()
-
- private IndexSearcher searcher;
- /*
- * this locks all modifications to the current searcher.
- */
-
- private final Object searcherLock = new Object();
- /*
- * this lock synchronizes all possible modifications to the
- * current index directory. It should not be possible to try modifying
- * the same index concurrently. Note: Do not acquire the searcher lock
- * before acquiring this lock!
- */
- private final Object modifyCurrentIndexLock = new Object();
-
- private volatile boolean closed = false;
- // minimum score for hits generated by the spell checker query
-
- private float accuracy = DEFAULT_ACCURACY;
-
- private StringDistance sd;
- private Comparator comparator;
-
- /**
- * Use the given directory as a spell checker index. The directory
- * is created if it doesn't exist yet.
- * @param spellIndex the spell index directory
- * @param sd the {@link StringDistance} measurement to use
- * @throws IOException if Spellchecker can not open the directory
- */
- public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
- this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
- }
- /**
- * Use the given directory as a spell checker index with a
- * {@link LevensteinDistance} as the default {@link StringDistance}. The
- * directory is created if it doesn't exist yet.
- *
- * @param spellIndex
- * the spell index directory
- * @throws IOException
- * if spellchecker can not open the directory
- */
- public SpellChecker(Directory spellIndex) throws IOException {
- this(spellIndex, new LevensteinDistance());
- }
-
- /**
- * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure
- * and the given {@link java.util.Comparator} for sorting the results.
- * @param spellIndex The spelling index
- * @param sd The distance
- * @param comparator The comparator
- * @throws IOException if there is a problem opening the index
- */
- public SpellChecker(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException {
- setSpellIndex(spellIndex);
- setStringDistance(sd);
- this.comparator = comparator;
- }
-
- /**
- * Use a different index as the spell checker index or re-open
- * the existing index if spellIndex is the same value
- * as given in the constructor.
- * @param spellIndexDir the spell directory to use
- * @throws AlreadyClosedException if the Spellchecker is already closed
- * @throws IOException if spellchecker can not open the directory
- */
- // TODO: we should make this final as it is called in the constructor
- public void setSpellIndex(Directory spellIndexDir) throws IOException {
- // this could be the same directory as the current spellIndex
- // modifications to the directory should be synchronized
- synchronized (modifyCurrentIndexLock) {
- ensureOpen();
- if (!IndexReader.indexExists(spellIndexDir)) {
- IndexWriter writer = new IndexWriter(spellIndexDir,
- new IndexWriterConfig(Version.LUCENE_CURRENT,
- new WhitespaceAnalyzer(Version.LUCENE_CURRENT)));
- writer.close();
- }
- swapSearcher(spellIndexDir);
- }
- }
-
- /**
- * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}.
- * @param comparator the comparator
- */
- public void setComparator(Comparator comparator) {
- this.comparator = comparator;
- }
-
- public Comparator getComparator() {
- return comparator;
- }
-
- /**
- * Sets the {@link StringDistance} implementation for this
- * {@link SpellChecker} instance.
- *
- * @param sd the {@link StringDistance} implementation for this
- * {@link SpellChecker} instance
- */
- public void setStringDistance(StringDistance sd) {
- this.sd = sd;
- }
- /**
- * Returns the {@link StringDistance} instance used by this
- * {@link SpellChecker} instance.
- *
- * @return the {@link StringDistance} instance used by this
- * {@link SpellChecker} instance.
- */
- public StringDistance getStringDistance() {
- return sd;
- }
-
- /**
- * Sets the accuracy 0 < minScore < 1; default {@link #DEFAULT_ACCURACY}
- * @param acc The new accuracy
- */
- public void setAccuracy(float acc) {
- this.accuracy = acc;
- }
-
- /**
- * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)}, to
- * decide whether a suggestion is included or not.
- * @return The current accuracy setting
- */
- public float getAccuracy() {
- return accuracy;
- }
-
- /**
- * Suggest similar words.
- *
- *
As the Lucene similarity that is used to fetch the most relevant n-grammed terms
- * is not the same as the edit distance strategy used to calculate the best
- * matching spell-checked word from the hits that Lucene found, one usually has
- * to retrieve a couple of numSug's in order to get the true best match.
- *
- *
I.e. if numSug == 1, don't count on that suggestion being the best one.
- * Thus, you should set this value to at least 5 for a good suggestion.
- *
- * @param word the word you want a spell check done on
- * @param numSug the number of suggested words
- * @throws IOException if the underlying index throws an {@link IOException}
- * @throws AlreadyClosedException if the Spellchecker is already closed
- * @return String[]
- *
- * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
- */
- public String[] suggestSimilar(String word, int numSug) throws IOException {
- return this.suggestSimilar(word, numSug, null, null, false);
- }
-
- /**
- * Suggest similar words.
- *
- *
As the Lucene similarity that is used to fetch the most relevant n-grammed terms
- * is not the same as the edit distance strategy used to calculate the best
- * matching spell-checked word from the hits that Lucene found, one usually has
- * to retrieve a couple of numSug's in order to get the true best match.
- *
- *
I.e. if numSug == 1, don't count on that suggestion being the best one.
- * Thus, you should set this value to at least 5 for a good suggestion.
- *
- * @param word the word you want a spell check done on
- * @param numSug the number of suggested words
- * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
- * @throws IOException if the underlying index throws an {@link IOException}
- * @throws AlreadyClosedException if the Spellchecker is already closed
- * @return String[]
- *
- * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
- */
- public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
- return this.suggestSimilar(word, numSug, null, null, false, accuracy);
- }
-
- /**
- * Suggest similar words (optionally restricted to a field of an index).
- *
- *
As the Lucene similarity that is used to fetch the most relevant n-grammed terms
- * is not the same as the edit distance strategy used to calculate the best
- * matching spell-checked word from the hits that Lucene found, one usually has
- * to retrieve a couple of numSug's in order to get the true best match.
- *
- *
I.e. if numSug == 1, don't count on that suggestion being the best one.
- * Thus, you should set this value to at least 5 for a good suggestion.
- *
- *
Uses the {@link #getAccuracy()} value passed into the constructor as the accuracy.
- *
- * @param word the word you want a spell check done on
- * @param numSug the number of suggested words
- * @param ir the indexReader of the user index (can be null see field param)
- * @param field the field of the user index: if field is not null, the suggested
- * words are restricted to the words present in this field.
- * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
- * (only if restricted mode = (indexReader!=null and field!=null)
- * @throws IOException if the underlying index throws an {@link IOException}
- * @throws AlreadyClosedException if the Spellchecker is already closed
- * @return String[] the sorted list of the suggest words with these 2 criteria:
- * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
- * of the suggest words in the field of the user index
- *
- * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
- */
- public String[] suggestSimilar(String word, int numSug, IndexReader ir,
- String field, boolean morePopular) throws IOException {
- return suggestSimilar(word, numSug, ir, field, morePopular, accuracy);
- }
-
-
- /**
- * Suggest similar words (optionally restricted to a field of an index).
- *
- *
As the Lucene similarity that is used to fetch the most relevant n-grammed terms
- * is not the same as the edit distance strategy used to calculate the best
- * matching spell-checked word from the hits that Lucene found, one usually has
- * to retrieve a couple of numSug's in order to get the true best match.
- *
- *
I.e. if numSug == 1, don't count on that suggestion being the best one.
- * Thus, you should set this value to at least 5 for a good suggestion.
- *
- * @param word the word you want a spell check done on
- * @param numSug the number of suggested words
- * @param ir the indexReader of the user index (can be null see field param)
- * @param field the field of the user index: if field is not null, the suggested
- * words are restricted to the words present in this field.
- * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
- * (only if restricted mode = (indexReader!=null and field!=null)
- * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
- * @throws IOException if the underlying index throws an {@link IOException}
- * @throws AlreadyClosedException if the Spellchecker is already closed
- * @return String[] the sorted list of the suggest words with these 2 criteria:
- * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
- * of the suggest words in the field of the user index
- */
- public String[] suggestSimilar(String word, int numSug, IndexReader ir,
- String field, boolean morePopular, float accuracy) throws IOException {
- // obtainSearcher calls ensureOpen
- final IndexSearcher indexSearcher = obtainSearcher();
- try{
-
- final int lengthWord = word.length();
-
- final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
- final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
- // if the word exists in the real index and we don't care for word frequency, return the word itself
- if (!morePopular && freq > 0) {
- return new String[] { word };
- }
-
- BooleanQuery query = new BooleanQuery();
- String[] grams;
- String key;
-
- for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
-
- key = "gram" + ng; // form key
-
- grams = formGrams(word, ng); // form word into ngrams (allow dups too)
-
- if (grams.length == 0) {
- continue; // hmm
- }
-
- if (bStart > 0) { // should we boost prefixes?
- add(query, "start" + ng, grams[0], bStart); // matches start of word
-
- }
- if (bEnd > 0) { // should we boost suffixes
- add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
-
- }
- for (int i = 0; i < grams.length; i++) {
- add(query, key, grams[i]);
- }
- }
-
- int maxHits = 10 * numSug;
-
- // System.out.println("Q: " + query);
- ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs;
- // System.out.println("HITS: " + hits.length());
- SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);
-
- // go thru more than 'maxr' matches in case the distance filter triggers
- int stop = Math.min(hits.length, maxHits);
- SuggestWord sugWord = new SuggestWord();
- for (int i = 0; i < stop; i++) {
-
- sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word
-
- // don't suggest a word for itself, that would be silly
- if (sugWord.string.equals(word)) {
- continue;
- }
-
- // edit distance
- sugWord.score = sd.getDistance(word,sugWord.string);
- if (sugWord.score < accuracy) {
- continue;
- }
-
- if (ir != null && field != null) { // use the user index
- sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
- // don't suggest a word that is not present in the field
- if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
- continue;
- }
- }
- sugQueue.insertWithOverflow(sugWord);
- if (sugQueue.size() == numSug) {
- // if queue full, maintain the minScore score
- accuracy = sugQueue.top().score;
- }
- sugWord = new SuggestWord();
- }
-
- // convert to array string
- String[] list = new String[sugQueue.size()];
- for (int i = sugQueue.size() - 1; i >= 0; i--) {
- list[i] = sugQueue.pop().string;
- }
-
- return list;
- } finally {
- releaseSearcher(indexSearcher);
- }
- }
- /**
- * Add a clause to a boolean query.
- */
- private static void add(BooleanQuery q, String name, String value, float boost) {
- Query tq = new TermQuery(new Term(name, value));
- tq.setBoost(boost);
- q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
- }
-
- /**
- * Add a clause to a boolean query.
- */
- private static void add(BooleanQuery q, String name, String value) {
- q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
- }
-
- /**
- * Form all ngrams for a given word.
- * @param text the word to parse
- * @param ng the ngram length e.g. 3
- * @return an array of all ngrams in the word and note that duplicates are not removed
- */
- private static String[] formGrams(String text, int ng) {
- int len = text.length();
- String[] res = new String[len - ng + 1];
- for (int i = 0; i < len - ng + 1; i++) {
- res[i] = text.substring(i, i + ng);
- }
- return res;
- }
-
- /**
- * Removes all terms from the spell check index.
- * @throws IOException
- * @throws AlreadyClosedException if the Spellchecker is already closed
- */
- public void clearIndex() throws IOException {
- synchronized (modifyCurrentIndexLock) {
- ensureOpen();
- final Directory dir = this.spellIndex;
- final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
- Version.LUCENE_CURRENT,
- new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
- .setOpenMode(OpenMode.CREATE));
- writer.close();
- swapSearcher(dir);
- }
- }
-
- /**
- * Check whether the word exists in the index.
- * @param word
- * @throws IOException
- * @throws AlreadyClosedException if the Spellchecker is already closed
- * @return true if the word exists in the index
- */
- public boolean exist(String word) throws IOException {
- // obtainSearcher calls ensureOpen
- final IndexSearcher indexSearcher = obtainSearcher();
- try{
- return indexSearcher.docFreq(F_WORD_TERM.createTerm(word)) > 0;
- } finally {
- releaseSearcher(indexSearcher);
- }
- }
-
- /**
- * Indexes the data from the given {@link Dictionary}.
- * @param dict Dictionary to index
- * @param mergeFactor mergeFactor to use when indexing
- * @param ramMB the max amount or memory in MB to use
- * @param optimize whether or not the spellcheck index should be optimized
- * @throws AlreadyClosedException if the Spellchecker is already closed
- * @throws IOException
- */
- public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException {
- synchronized (modifyCurrentIndexLock) {
- ensureOpen();
- final Directory dir = this.spellIndex;
- final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB));
- ((TieredMergePolicy) writer.getConfig().getMergePolicy()).setMaxMergeAtOnce(mergeFactor);
- IndexSearcher indexSearcher = obtainSearcher();
- final List termsEnums = new ArrayList();
-
- if (searcher.maxDoc() > 0) {
- new ReaderUtil.Gather(searcher.getIndexReader()) {
- @Override
- protected void add(int base, IndexReader r) throws IOException {
- Terms terms = r.terms(F_WORD);
- if (terms != null)
- termsEnums.add(terms.iterator());
- }
- }.run();
- }
-
- boolean isEmpty = termsEnums.isEmpty();
-
- try {
- Iterator iter = dict.getWordsIterator();
- BytesRef currentTerm = new BytesRef();
-
- terms: while (iter.hasNext()) {
- String word = iter.next();
-
- int len = word.length();
- if (len < 3) {
- continue; // too short we bail but "too long" is fine...
- }
-
- if (!isEmpty) {
- // we have a non-empty index, check if the term exists
- currentTerm.copy(word);
- for (TermsEnum te : termsEnums) {
- if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) {
- continue terms;
- }
- }
- }
-
- // ok index the word
- Document doc = createDocument(word, getMin(len), getMax(len));
- writer.addDocument(doc);
- }
- } finally {
- releaseSearcher(indexSearcher);
- }
- // close writer
- if (optimize)
- writer.optimize();
- writer.close();
- // also re-open the spell index to see our own changes when the next suggestion
- // is fetched:
- swapSearcher(dir);
- }
- }
-
- /**
- * Indexes the data from the given {@link Dictionary}.
- * @param dict the dictionary to index
- * @param mergeFactor mergeFactor to use when indexing
- * @param ramMB the max amount or memory in MB to use
- * @throws IOException
- */
- public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
- indexDictionary(dict, mergeFactor, ramMB, true);
- }
-
- /**
- * Indexes the data from the given {@link Dictionary}.
- * @param dict the dictionary to index
- * @throws IOException
- */
- public final void indexDictionary(Dictionary dict) throws IOException {
- indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
- }
-
- private static int getMin(int l) {
- if (l > 5) {
- return 3;
- }
- if (l == 5) {
- return 2;
- }
- return 1;
- }
-
- private static int getMax(int l) {
- if (l > 5) {
- return 4;
- }
- if (l == 5) {
- return 3;
- }
- return 2;
- }
-
- private static Document createDocument(String text, int ng1, int ng2) {
- Document doc = new Document();
- // the word field is never queried on... its indexed so it can be quickly
- // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
- Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
- f.setOmitTermFreqAndPositions(true);
- f.setOmitNorms(true);
- doc.add(f); // orig term
- addGram(text, doc, ng1, ng2);
- return doc;
- }
-
- private static void addGram(String text, Document doc, int ng1, int ng2) {
- int len = text.length();
- for (int ng = ng1; ng <= ng2; ng++) {
- String key = "gram" + ng;
- String end = null;
- for (int i = 0; i < len - ng + 1; i++) {
- String gram = text.substring(i, i + ng);
- doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
- if (i == 0) {
- // only one term possible in the startXXField, TF/pos and norms aren't needed.
- Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
- startField.setOmitTermFreqAndPositions(true);
- startField.setOmitNorms(true);
- doc.add(startField);
- }
- end = gram;
- }
- if (end != null) { // may not be present if len==ng1
- // only one term possible in the endXXField, TF/pos and norms aren't needed.
- Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED);
- endField.setOmitTermFreqAndPositions(true);
- endField.setOmitNorms(true);
- doc.add(endField);
- }
- }
- }
-
- private IndexSearcher obtainSearcher() {
- synchronized (searcherLock) {
- ensureOpen();
- searcher.getIndexReader().incRef();
- return searcher;
- }
- }
-
- private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{
- // don't check if open - always decRef
- // don't decrement the private searcher - could have been swapped
- aSearcher.getIndexReader().decRef();
- }
-
- private void ensureOpen() {
- if (closed) {
- throw new AlreadyClosedException("Spellchecker has been closed");
- }
- }
-
- /**
- * Close the IndexSearcher used by this SpellChecker
- * @throws IOException if the close operation causes an {@link IOException}
- * @throws AlreadyClosedException if the {@link SpellChecker} is already closed
- */
- public void close() throws IOException {
- synchronized (searcherLock) {
- ensureOpen();
- closed = true;
- if (searcher != null) {
- searcher.close();
- }
- searcher = null;
- }
- }
-
- private void swapSearcher(final Directory dir) throws IOException {
- /*
- * opening a searcher is possibly very expensive.
- * We rather close it again if the Spellchecker was closed during
- * this operation than block access to the current searcher while opening.
- */
- final IndexSearcher indexSearcher = createSearcher(dir);
- synchronized (searcherLock) {
- if(closed){
- indexSearcher.close();
- throw new AlreadyClosedException("Spellchecker has been closed");
- }
- if (searcher != null) {
- searcher.close();
- }
- // set the spellindex in the sync block - ensure consistency.
- searcher = indexSearcher;
- this.spellIndex = dir;
- }
- }
-
- /**
- * Creates a new read-only IndexSearcher
- * @param dir the directory used to open the searcher
- * @return a new read-only IndexSearcher
- * @throws IOException f there is a low-level IO error
- */
- // for testing purposes
- IndexSearcher createSearcher(final Directory dir) throws IOException{
- return new IndexSearcher(dir, true);
- }
-
- /**
- * Returns true if and only if the {@link SpellChecker} is
- * closed, otherwise false.
- *
- * @return true if and only if the {@link SpellChecker} is
- * closed, otherwise false.
- */
- boolean isClosed(){
- return closed;
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,35 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Interface for string distances.
- */
-public interface StringDistance {
-
- /**
- * Returns a float between 0 and 1 based on how similar the specified strings are to one another.
- * Returning a value of 1 means the specified strings are identical and 0 means the
- * string are maximally different.
- * @param s1 The first string.
- * @param s2 The second string.
- * @return a float between 0 and 1 based on how similar the specified strings are to one another.
- */
- public float getDistance(String s1,String s2);
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,45 +0,0 @@
-package org.apache.lucene.search.spell;
-
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * SuggestWord, used in suggestSimilar method in SpellChecker class.
- *
- * Default sort is first by score, then by frequency.
- *
- *
- */
-public final class SuggestWord{
-
- /**
- * the score of the word
- */
- public float score;
-
- /**
- * The freq of the word
- */
- public int freq;
-
- /**
- * the suggested word
- */
- public String string;
-
-}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,47 +0,0 @@
-package org.apache.lucene.search.spell;
-
-import java.util.Comparator;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-/**
- * Frequency first, then score. Must have
- *
- **/
-public class SuggestWordFrequencyComparator implements Comparator {
-
- public int compare(SuggestWord first, SuggestWord second) {
- // first criteria: the frequency
- if (first.freq > second.freq) {
- return 1;
- }
- if (first.freq < second.freq) {
- return -1;
- }
-
- // second criteria (if first criteria is equal): the score
- if (first.score > second.score) {
- return 1;
- }
- if (first.score < second.score) {
- return -1;
- }
- // third criteria: term text
- return second.string.compareTo(first.string);
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,63 +0,0 @@
-package org.apache.lucene.search.spell;
-
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.PriorityQueue;
-
-import java.util.Comparator;
-
-
-/**
- * Sorts SuggestWord instances
- *
- * @see org.apache.lucene.search.spell.SuggestWordScoreComparator
- * @see org.apache.lucene.search.spell.SuggestWordFrequencyComparator
- *
- */
-public final class SuggestWordQueue extends PriorityQueue {
- public static final Comparator DEFAULT_COMPARATOR = new SuggestWordScoreComparator();
-
-
- private Comparator comparator;
-
- /**
- * Use the {@link #DEFAULT_COMPARATOR}
- * @param size The size of the queue
- */
- public SuggestWordQueue (int size) {
- super(size);
- comparator = DEFAULT_COMPARATOR;
- }
-
- /**
- * Specify the size of the queue and the comparator to use for sorting.
- * @param size The size
- * @param comparator The comparator.
- */
- public SuggestWordQueue(int size, Comparator comparator){
- super(size);
- this.comparator = comparator;
- }
-
- @Override
- protected final boolean lessThan (SuggestWord wa, SuggestWord wb) {
- int val = comparator.compare(wa, wb);
- return val < 0;
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,47 +0,0 @@
-package org.apache.lucene.search.spell;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Comparator;
-
-
-/**
- * Score first, then frequency
- *
- **/
-public class SuggestWordScoreComparator implements Comparator {
- public int compare(SuggestWord first, SuggestWord second) {
- // first criteria: the distance
- if (first.score > second.score) {
- return 1;
- }
- if (first.score < second.score) {
- return -1;
- }
-
- // second criteria (if first criteria is equal): the popularity
- if (first.freq > second.freq) {
- return 1;
- }
-
- if (first.freq < second.freq) {
- return -1;
- }
- // third criteria: term text
- return second.string.compareTo(first.string);
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,144 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.English;
-import org.apache.lucene.util.LuceneTestCase;
-
-public class TestDirectSpellChecker extends LuceneTestCase {
-
- public void testSimpleExamples() throws Exception {
- DirectSpellChecker spellChecker = new DirectSpellChecker();
- spellChecker.setMinQueryLength(0);
- Directory dir = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, dir,
- new MockAnalyzer(random, MockTokenizer.SIMPLE, true));
-
- for (int i = 0; i < 20; i++) {
- Document doc = new Document();
- doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
- }
-
- IndexReader ir = writer.getReader();
-
- SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
- assertTrue(similar.length > 0);
- assertEquals("five", similar[0].string);
-
- similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir, false);
- if (similar.length > 0) {
- assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself
- }
-
- similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
- assertTrue(similar.length > 0);
- assertEquals("five", similar[0].string);
-
- similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false);
- assertTrue(similar.length > 0);
- assertEquals("five", similar[0].string);
-
- similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false);
- assertTrue(similar.length > 0);
- assertEquals("five", similar[0].string);
-
- assertTrue(similar.length > 0);
- similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false);
- assertEquals("five", similar[0].string);
-
- // add some more documents
- for (int i = 1000; i < 1100; i++) {
- Document doc = new Document();
- doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
- }
-
- ir.close();
- ir = writer.getReader();
-
- // look ma, no spellcheck index rebuild
- similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false);
- assertTrue(similar.length > 0);
- assertEquals("thousand", similar[0].string);
-
- ir.close();
- writer.close();
- dir.close();
- }
-
- public void testOptions() throws Exception {
- Directory dir = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, dir,
- new MockAnalyzer(random, MockTokenizer.SIMPLE, true));
-
- Document doc = new Document();
- doc.add(newField("text", "foobar", Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
- doc.add(newField("text", "foobar", Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
- doc.add(newField("text", "foobaz", Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
- doc.add(newField("text", "fobar", Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
-
- IndexReader ir = writer.getReader();
-
- DirectSpellChecker spellChecker = new DirectSpellChecker();
- spellChecker.setMaxQueryFrequency(0F);
- SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 1, ir, true);
- assertEquals(0, similar.length);
-
- spellChecker = new DirectSpellChecker(); // reset defaults
- spellChecker.setMinQueryLength(5);
- similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir, true);
- assertEquals(0, similar.length);
-
- spellChecker = new DirectSpellChecker(); // reset defaults
- spellChecker.setMaxEdits(1);
- similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
- assertEquals(0, similar.length);
-
- spellChecker = new DirectSpellChecker(); // reset defaults
- spellChecker.setAccuracy(0.9F);
- similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
- assertEquals(0, similar.length);
-
- spellChecker = new DirectSpellChecker(); // reset defaults
- spellChecker.setMinPrefix(0);
- similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
- assertEquals(1, similar.length);
-
- spellChecker = new DirectSpellChecker(); // reset defaults
- spellChecker.setMinPrefix(1);
- similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
- assertEquals(0, similar.length);
-
- ir.close();
- writer.close();
- dir.close();
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,49 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.LuceneTestCase;
-
-public class TestJaroWinklerDistance extends LuceneTestCase {
-
- private StringDistance sd = new JaroWinklerDistance();
-
- public void testGetDistance() {
- float d = sd.getDistance("al", "al");
- assertTrue(d == 1.0f);
- d = sd.getDistance("martha", "marhta");
- assertTrue(d > 0.961 && d <0.962);
- d = sd.getDistance("jones", "johnson");
- assertTrue(d > 0.832 && d < 0.833);
- d = sd.getDistance("abcvwxyz", "cabvwxyz");
- assertTrue(d > 0.958 && d < 0.959);
- d = sd.getDistance("dwayne", "duane");
- assertTrue(d > 0.84 && d < 0.841);
- d = sd.getDistance("dixon", "dicksonx");
- assertTrue(d > 0.813 && d < 0.814);
- d = sd.getDistance("fvie", "ten");
- assertTrue(d == 0f);
- float d1 = sd.getDistance("zac ephron", "zac efron");
- float d2 = sd.getDistance("zac ephron", "kai ephron");
- assertTrue(d1 > d2);
- d1 = sd.getDistance("brittney spears", "britney spears");
- d2 = sd.getDistance("brittney spears", "brittney startzman");
- assertTrue(d1 > d2);
- }
-
-}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,54 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.LuceneTestCase;
-
-public class TestLevenshteinDistance extends LuceneTestCase {
-
- private StringDistance sd = new LevensteinDistance();
-
- public void testGetDistance() {
- float d = sd.getDistance("al", "al");
- assertEquals(d,1.0f,0.001);
- d = sd.getDistance("martha", "marhta");
- assertEquals(d,0.6666,0.001);
- d = sd.getDistance("jones", "johnson");
- assertEquals(d,0.4285,0.001);
- d = sd.getDistance("abcvwxyz", "cabvwxyz");
- assertEquals(d,0.75,0.001);
- d = sd.getDistance("dwayne", "duane");
- assertEquals(d,0.666,0.001);
- d = sd.getDistance("dixon", "dicksonx");
- assertEquals(d,0.5,0.001);
- d = sd.getDistance("six", "ten");
- assertEquals(d,0,0.001);
- float d1 = sd.getDistance("zac ephron", "zac efron");
- float d2 = sd.getDistance("zac ephron", "kai ephron");
- assertEquals(d1,d2,0.001);
- d1 = sd.getDistance("brittney spears", "britney spears");
- d2 = sd.getDistance("brittney spears", "brittney startzman");
- assertTrue(d1 > d2);
- }
-
- public void testEmpty() throws Exception {
- float d = sd.getDistance("", "al");
- assertEquals(d,0.0f,0.001);
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,210 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-
-/**
- * Test case for LuceneDictionary.
- * It first creates a simple index and then a couple of instances of LuceneDictionary
- * on different fields and checks if all the right text comes back.
- */
-public class TestLuceneDictionary extends LuceneTestCase {
-
- private Directory store;
-
- private IndexReader indexReader = null;
- private LuceneDictionary ld;
- private Iterator it;
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
- store = newDirectory();
- IndexWriter writer = new IndexWriter(store, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
-
- Document doc;
-
- doc = new Document();
- doc.add(newField("aaa", "foo", Field.Store.YES, Field.Index.ANALYZED));
- writer.addDocument(doc);
-
- doc = new Document();
- doc.add(newField("aaa", "foo", Field.Store.YES, Field.Index.ANALYZED));
- writer.addDocument(doc);
-
- doc = new Document();
- doc.add(new Field("contents", "Tom", Field.Store.YES, Field.Index.ANALYZED));
- writer.addDocument(doc);
-
- doc = new Document();
- doc.add(new Field("contents", "Jerry", Field.Store.YES, Field.Index.ANALYZED));
- writer.addDocument(doc);
-
- doc = new Document();
- doc.add(newField("zzz", "bar", Field.Store.YES, Field.Index.ANALYZED));
- writer.addDocument(doc);
-
- writer.optimize();
- writer.close();
- }
-
- @Override
- public void tearDown() throws Exception {
- if (indexReader != null)
- indexReader.close();
- store.close();
- super.tearDown();
- }
-
- public void testFieldNonExistent() throws IOException {
- try {
- indexReader = IndexReader.open(store, true);
-
- ld = new LuceneDictionary(indexReader, "nonexistent_field");
- it = ld.getWordsIterator();
-
- assertFalse("More elements than expected", it.hasNext());
- assertTrue("Nonexistent element is really null", it.next() == null);
- } finally {
- if (indexReader != null) { indexReader.close(); }
- }
- }
-
- public void testFieldAaa() throws IOException {
- try {
- indexReader = IndexReader.open(store, true);
-
- ld = new LuceneDictionary(indexReader, "aaa");
- it = ld.getWordsIterator();
-
- assertTrue("First element doesn't exist.", it.hasNext());
- assertTrue("First element isn't correct", it.next().equals("foo"));
- assertFalse("More elements than expected", it.hasNext());
- assertTrue("Nonexistent element is really null", it.next() == null);
- } finally {
- if (indexReader != null) { indexReader.close(); }
- }
- }
-
- public void testFieldContents_1() throws IOException {
- try {
- indexReader = IndexReader.open(store, true);
-
- ld = new LuceneDictionary(indexReader, "contents");
- it = ld.getWordsIterator();
-
- assertTrue("First element doesn't exist.", it.hasNext());
- assertTrue("First element isn't correct", it.next().equals("Jerry"));
- assertTrue("Second element doesn't exist.", it.hasNext());
- assertTrue("Second element isn't correct", it.next().equals("Tom"));
- assertFalse("More elements than expected", it.hasNext());
- assertTrue("Nonexistent element is really null", it.next() == null);
-
- ld = new LuceneDictionary(indexReader, "contents");
- it = ld.getWordsIterator();
-
- int counter = 2;
- while (it.hasNext()) {
- it.next();
- counter--;
- }
-
- assertTrue("Number of words incorrect", counter == 0);
- }
- finally {
- if (indexReader != null) { indexReader.close(); }
- }
- }
-
- public void testFieldContents_2() throws IOException {
- try {
- indexReader = IndexReader.open(store, true);
-
- ld = new LuceneDictionary(indexReader, "contents");
- it = ld.getWordsIterator();
-
- // hasNext() should have no side effects
- assertTrue("First element isn't were it should be.", it.hasNext());
- assertTrue("First element isn't were it should be.", it.hasNext());
- assertTrue("First element isn't were it should be.", it.hasNext());
-
- // just iterate through words
- assertTrue("First element isn't correct", it.next().equals("Jerry"));
- assertTrue("Second element isn't correct", it.next().equals("Tom"));
- assertTrue("Nonexistent element is really null", it.next() == null);
-
- // hasNext() should still have no side effects ...
- assertFalse("There should be any more elements", it.hasNext());
- assertFalse("There should be any more elements", it.hasNext());
- assertFalse("There should be any more elements", it.hasNext());
-
- // .. and there are really no more words
- assertTrue("Nonexistent element is really null", it.next() == null);
- assertTrue("Nonexistent element is really null", it.next() == null);
- assertTrue("Nonexistent element is really null", it.next() == null);
- }
- finally {
- if (indexReader != null) { indexReader.close(); }
- }
- }
-
- public void testFieldZzz() throws IOException {
- try {
- indexReader = IndexReader.open(store, true);
-
- ld = new LuceneDictionary(indexReader, "zzz");
- it = ld.getWordsIterator();
-
- assertTrue("First element doesn't exist.", it.hasNext());
- assertTrue("First element isn't correct", it.next().equals("bar"));
- assertFalse("More elements than expected", it.hasNext());
- assertTrue("Nonexistent element is really null", it.next() == null);
- }
- finally {
- if (indexReader != null) { indexReader.close(); }
- }
- }
-
- public void testSpellchecker() throws IOException {
- Directory dir = newDirectory();
- SpellChecker sc = new SpellChecker(dir);
- indexReader = IndexReader.open(store, true);
- sc.indexDictionary(new LuceneDictionary(indexReader, "contents"));
- String[] suggestions = sc.suggestSimilar("Tam", 1);
- assertEquals(1, suggestions.length);
- assertEquals("Tom", suggestions[0]);
- suggestions = sc.suggestSimilar("Jarry", 1);
- assertEquals(1, suggestions.length);
- assertEquals("Jerry", suggestions[0]);
- indexReader.close();
- sc.close();
- dir.close();
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,132 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.LuceneTestCase;
-
-public class TestNGramDistance extends LuceneTestCase {
-
-
-
- public void testGetDistance1() {
- StringDistance nsd = new NGramDistance(1);
- float d = nsd.getDistance("al", "al");
- assertEquals(d,1.0f,0.001);
- d = nsd.getDistance("a", "a");
- assertEquals(d,1.0f,0.001);
- d = nsd.getDistance("b", "a");
- assertEquals(d,0.0f,0.001);
- d = nsd.getDistance("martha", "marhta");
- assertEquals(d,0.6666,0.001);
- d = nsd.getDistance("jones", "johnson");
- assertEquals(d,0.4285,0.001);
- d = nsd.getDistance("natural", "contrary");
- assertEquals(d,0.25,0.001);
- d = nsd.getDistance("abcvwxyz", "cabvwxyz");
- assertEquals(d,0.75,0.001);
- d = nsd.getDistance("dwayne", "duane");
- assertEquals(d,0.666,0.001);
- d = nsd.getDistance("dixon", "dicksonx");
- assertEquals(d,0.5,0.001);
- d = nsd.getDistance("six", "ten");
- assertEquals(d,0,0.001);
- float d1 = nsd.getDistance("zac ephron", "zac efron");
- float d2 = nsd.getDistance("zac ephron", "kai ephron");
- assertEquals(d1,d2,0.001);
- d1 = nsd.getDistance("brittney spears", "britney spears");
- d2 = nsd.getDistance("brittney spears", "brittney startzman");
- assertTrue(d1 > d2);
- d1 = nsd.getDistance("12345678", "12890678");
- d2 = nsd.getDistance("12345678", "72385698");
- assertEquals(d1,d2,001);
- }
-
- public void testGetDistance2() {
- StringDistance sd = new NGramDistance(2);
- float d = sd.getDistance("al", "al");
- assertEquals(d,1.0f,0.001);
- d = sd.getDistance("a", "a");
- assertEquals(d,1.0f,0.001);
- d = sd.getDistance("b", "a");
- assertEquals(d,0.0f,0.001);
- d = sd.getDistance("a", "aa");
- assertEquals(d,0.5f,0.001);
- d = sd.getDistance("martha", "marhta");
- assertEquals(d,0.6666,0.001);
- d = sd.getDistance("jones", "johnson");
- assertEquals(d,0.4285,0.001);
- d = sd.getDistance("natural", "contrary");
- assertEquals(d,0.25,0.001);
- d = sd.getDistance("abcvwxyz", "cabvwxyz");
- assertEquals(d,0.625,0.001);
- d = sd.getDistance("dwayne", "duane");
- assertEquals(d,0.5833,0.001);
- d = sd.getDistance("dixon", "dicksonx");
- assertEquals(d,0.5,0.001);
- d = sd.getDistance("six", "ten");
- assertEquals(d,0,0.001);
- float d1 = sd.getDistance("zac ephron", "zac efron");
- float d2 = sd.getDistance("zac ephron", "kai ephron");
- assertTrue(d1 > d2);
- d1 = sd.getDistance("brittney spears", "britney spears");
- d2 = sd.getDistance("brittney spears", "brittney startzman");
- assertTrue(d1 > d2);
- d1 = sd.getDistance("0012345678", "0012890678");
- d2 = sd.getDistance("0012345678", "0072385698");
- assertEquals(d1,d2,0.001);
- }
-
- public void testGetDistance3() {
- StringDistance sd = new NGramDistance(3);
- float d = sd.getDistance("al", "al");
- assertEquals(d,1.0f,0.001);
- d = sd.getDistance("a", "a");
- assertEquals(d,1.0f,0.001);
- d = sd.getDistance("b", "a");
- assertEquals(d,0.0f,0.001);
- d = sd.getDistance("martha", "marhta");
- assertEquals(d,0.7222,0.001);
- d = sd.getDistance("jones", "johnson");
- assertEquals(d,0.4762,0.001);
- d = sd.getDistance("natural", "contrary");
- assertEquals(d,0.2083,0.001);
- d = sd.getDistance("abcvwxyz", "cabvwxyz");
- assertEquals(d,0.5625,0.001);
- d = sd.getDistance("dwayne", "duane");
- assertEquals(d,0.5277,0.001);
- d = sd.getDistance("dixon", "dicksonx");
- assertEquals(d,0.4583,0.001);
- d = sd.getDistance("six", "ten");
- assertEquals(d,0,0.001);
- float d1 = sd.getDistance("zac ephron", "zac efron");
- float d2 = sd.getDistance("zac ephron", "kai ephron");
- assertTrue(d1 > d2);
- d1 = sd.getDistance("brittney spears", "britney spears");
- d2 = sd.getDistance("brittney spears", "brittney startzman");
- assertTrue(d1 > d2);
- d1 = sd.getDistance("0012345678", "0012890678");
- d2 = sd.getDistance("0012345678", "0072385698");
- assertTrue(d1 < d2);
- }
-
- public void testEmpty() throws Exception {
- StringDistance nsd = new NGramDistance(1);
- float d = nsd.getDistance("", "al");
- assertEquals(d,0.0f,0.001);
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,47 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-
-/**
- * Test case for PlainTextDictionary
- *
- */
-public class TestPlainTextDictionary extends LuceneTestCase {
-
- public void testBuild() throws IOException {
- final String LF = System.getProperty("line.separator");
- String input = "oneword" + LF + "twoword" + LF + "threeword";
- PlainTextDictionary ptd = new PlainTextDictionary(new StringReader(input));
- Directory ramDir = newDirectory();
- SpellChecker spellChecker = new SpellChecker(ramDir);
- spellChecker.indexDictionary(ptd);
- String[] similar = spellChecker.suggestSimilar("treeword", 2);
- assertEquals(2, similar.length);
- assertEquals(similar[0], "threeword");
- assertEquals(similar[1], "oneword");
- spellChecker.close();
- ramDir.close();
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
--- lucene-clean-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java 2011-05-22 12:38:16.000000000 -0400
+++ lucene-trunk/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,438 +0,0 @@
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.store.AlreadyClosedException;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.English;
-import org.apache.lucene.util.LuceneTestCase;
-
-/**
- * Spell checker test case
- */
-public class TestSpellChecker extends LuceneTestCase {
- private SpellCheckerMock spellChecker;
- private Directory userindex, spellindex;
- private List searchers;
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
-
- //create a user index
- userindex = newDirectory();
- IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random)));
-
- for (int i = 0; i < 1000; i++) {
- Document doc = new Document();
- doc.add(newField("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
- doc.add(newField("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
- doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
- writer.addDocument(doc);
- }
- writer.close();
- searchers = Collections.synchronizedList(new ArrayList());
- // create the spellChecker
- spellindex = newDirectory();
- spellChecker = new SpellCheckerMock(spellindex);
- }
-
- @Override
- public void tearDown() throws Exception {
- userindex.close();
- if (!spellChecker.isClosed())
- spellChecker.close();
- spellindex.close();
- super.tearDown();
- }
-
-
- public void testBuild() throws CorruptIndexException, IOException {
- IndexReader r = IndexReader.open(userindex, true);
-
- spellChecker.clearIndex();
-
- addwords(r, spellChecker, "field1");
- int num_field1 = this.numdoc();
-
- addwords(r, spellChecker, "field2");
- int num_field2 = this.numdoc();
-
- assertEquals(num_field2, num_field1 + 1);
-
- assertLastSearcherOpen(4);
-
- checkCommonSuggestions(r);
- checkLevenshteinSuggestions(r);
-
- spellChecker.setStringDistance(new JaroWinklerDistance());
- spellChecker.setAccuracy(0.8f);
- checkCommonSuggestions(r);
- checkJaroWinklerSuggestions();
- // the accuracy is set to 0.8 by default, but the best result has a score of 0.925
- String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f);
- assertTrue(similar.length == 0);
- similar = spellChecker.suggestSimilar("fvie", 2, 0.92f);
- assertTrue(similar.length == 1);
-
- similar = spellChecker.suggestSimilar("fiv", 2);
- assertTrue(similar.length > 0);
- assertEquals(similar[0], "five");
-
- spellChecker.setStringDistance(new NGramDistance(2));
- spellChecker.setAccuracy(0.5f);
- checkCommonSuggestions(r);
- checkNGramSuggestions();
-
- r.close();
- }
-
- public void testComparator() throws Exception {
- IndexReader r = IndexReader.open(userindex, true);
- Directory compIdx = newDirectory();
- SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
- addwords(r, compareSP, "field3");
-
- String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false);
- assertTrue(similar.length == 2);
- //five and fvei have the same score, but different frequencies.
- assertEquals("fvei", similar[0]);
- assertEquals("five", similar[1]);
- r.close();
- if (!compareSP.isClosed())
- compareSP.close();
- compIdx.close();
- }
-
- private void checkCommonSuggestions(IndexReader r) throws IOException {
- String[] similar = spellChecker.suggestSimilar("fvie", 2);
- assertTrue(similar.length > 0);
- assertEquals(similar[0], "five");
-
- similar = spellChecker.suggestSimilar("five", 2);
- if (similar.length > 0) {
- assertFalse(similar[0].equals("five")); // don't suggest a word for itself
- }
-
- similar = spellChecker.suggestSimilar("fiv", 2);
- assertTrue(similar.length > 0);
- assertEquals(similar[0], "five");
-
- similar = spellChecker.suggestSimilar("fives", 2);
- assertTrue(similar.length > 0);
- assertEquals(similar[0], "five");
-
- assertTrue(similar.length > 0);
- similar = spellChecker.suggestSimilar("fie", 2);
- assertEquals(similar[0], "five");
-
- // test restraint to a field
- similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
- assertEquals(0, similar.length); // there isn't the term thousand in the field field1
-
- similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
- assertEquals(1, similar.length); // there is the term thousand in the field field2
- }
-
- private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
- // test small word
- String[] similar = spellChecker.suggestSimilar("fvie", 2);
- assertEquals(1, similar.length);
- assertEquals(similar[0], "five");
-
- similar = spellChecker.suggestSimilar("five", 2);
- assertEquals(1, similar.length);
- assertEquals(similar[0], "nine"); // don't suggest a word for itself
-
- similar = spellChecker.suggestSimilar("fiv", 2);
- assertEquals(1, similar.length);
- assertEquals(similar[0], "five");
-
- similar = spellChecker.suggestSimilar("ive", 2);
- assertEquals(2, similar.length);
- assertEquals(similar[0], "five");
- assertEquals(similar[1], "nine");
-
- similar = spellChecker.suggestSimilar("fives", 2);
- assertEquals(1, similar.length);
- assertEquals(similar[0], "five");
-
- similar = spellChecker.suggestSimilar("fie", 2);
- assertEquals(2, similar.length);
- assertEquals(similar[0], "five");
- assertEquals(similar[1], "nine");
-
- similar = spellChecker.suggestSimilar("fi", 2);
- assertEquals(1, similar.length);
- assertEquals(similar[0], "five");
-
- // test restraint to a field
- similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
- assertEquals(0, similar.length); // there isn't the term thousand in the field field1
-
- similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
- assertEquals(1, similar.length); // there is the term thousand in the field field2
-
- similar = spellChecker.suggestSimilar("onety", 2);
- assertEquals(2, similar.length);
- assertEquals(similar[0], "ninety");
- assertEquals(similar[1], "one");
- try {
- similar = spellChecker.suggestSimilar("tousand", 10, r, null, false);
- } catch (NullPointerException e) {
- assertTrue("threw an NPE, and it shouldn't have", false);
- }
- }
-
- private void checkJaroWinklerSuggestions() throws IOException {
- String[] similar = spellChecker.suggestSimilar("onety", 2);
- assertEquals(2, similar.length);
- assertEquals(similar[0], "one");
- assertEquals(similar[1], "ninety");
- }
-
- private void checkNGramSuggestions() throws IOException {
- String[] similar = spellChecker.suggestSimilar("onety", 2);
- assertEquals(2, similar.length);
- assertEquals(similar[0], "one");
- assertEquals(similar[1], "ninety");
- }
-
- private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException {
- long time = System.currentTimeMillis();
- sc.indexDictionary(new LuceneDictionary(r, field));
- time = System.currentTimeMillis() - time;
- //System.out.println("time to build " + field + ": " + time);
- }
-
- private int numdoc() throws IOException {
- IndexReader rs = IndexReader.open(spellindex, true);
- int num = rs.numDocs();
- assertTrue(num != 0);
- //System.out.println("num docs: " + num);
- rs.close();
- return num;
- }
-
- public void testClose() throws IOException {
- IndexReader r = IndexReader.open(userindex, true);
- spellChecker.clearIndex();
- String field = "field1";
- addwords(r, spellChecker, "field1");
- int num_field1 = this.numdoc();
- addwords(r, spellChecker, "field2");
- int num_field2 = this.numdoc();
- assertEquals(num_field2, num_field1 + 1);
- checkCommonSuggestions(r);
- assertLastSearcherOpen(4);
- spellChecker.close();
- assertSearchersClosed();
- try {
- spellChecker.close();
- fail("spellchecker was already closed");
- } catch (AlreadyClosedException e) {
- // expected
- }
- try {
- checkCommonSuggestions(r);
- fail("spellchecker was already closed");
- } catch (AlreadyClosedException e) {
- // expected
- }
-
- try {
- spellChecker.clearIndex();
- fail("spellchecker was already closed");
- } catch (AlreadyClosedException e) {
- // expected
- }
-
- try {
- spellChecker.indexDictionary(new LuceneDictionary(r, field));
- fail("spellchecker was already closed");
- } catch (AlreadyClosedException e) {
- // expected
- }
-
- try {
- spellChecker.setSpellIndex(spellindex);
- fail("spellchecker was already closed");
- } catch (AlreadyClosedException e) {
- // expected
- }
- assertEquals(4, searchers.size());
- assertSearchersClosed();
- r.close();
- }
-
- /*
- * tests if the internally shared indexsearcher is correctly closed
- * when the spellchecker is concurrently accessed and closed.
- */
- public void testConcurrentAccess() throws IOException, InterruptedException {
- assertEquals(1, searchers.size());
- final IndexReader r = IndexReader.open(userindex, true);
- spellChecker.clearIndex();
- assertEquals(2, searchers.size());
- addwords(r, spellChecker, "field1");
- assertEquals(3, searchers.size());
- int num_field1 = this.numdoc();
- addwords(r, spellChecker, "field2");
- assertEquals(4, searchers.size());
- int num_field2 = this.numdoc();
- assertEquals(num_field2, num_field1 + 1);
- int numThreads = 5 + this.random.nextInt(5);
- ExecutorService executor = Executors.newFixedThreadPool(numThreads);
- SpellCheckWorker[] workers = new SpellCheckWorker[numThreads];
- for (int i = 0; i < numThreads; i++) {
- SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r);
- executor.execute(spellCheckWorker);
- workers[i] = spellCheckWorker;
-
- }
- int iterations = 5 + random.nextInt(5);
- for (int i = 0; i < iterations; i++) {
- Thread.sleep(100);
- // concurrently reset the spell index
- spellChecker.setSpellIndex(this.spellindex);
- // for debug - prints the internal open searchers
- // showSearchersOpen();
- }
-
- spellChecker.close();
- executor.shutdown();
- // wait for 60 seconds - usually this is very fast but coverage runs could take quite long
- executor.awaitTermination(60L, TimeUnit.SECONDS);
-
- for (int i = 0; i < workers.length; i++) {
- assertFalse(String.format("worker thread %d failed", i), workers[i].failed);
- assertTrue(String.format("worker thread %d is still running but should be terminated", i), workers[i].terminated);
- }
- // 4 searchers more than iterations
- // 1. at creation
- // 2. clearIndex()
- // 2. and 3. during addwords
- assertEquals(iterations + 4, searchers.size());
- assertSearchersClosed();
- r.close();
- }
-
- private void assertLastSearcherOpen(int numSearchers) {
- assertEquals(numSearchers, searchers.size());
- IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]);
- for (int i = 0; i < searcherArray.length; i++) {
- if (i == searcherArray.length - 1) {
- assertTrue("expected last searcher open but was closed",
- searcherArray[i].getIndexReader().getRefCount() > 0);
- } else {
- assertFalse("expected closed searcher but was open - Index: " + i,
- searcherArray[i].getIndexReader().getRefCount() > 0);
- }
- }
- }
-
- private void assertSearchersClosed() {
- for (IndexSearcher searcher : searchers) {
- assertEquals(0, searcher.getIndexReader().getRefCount());
- }
- }
-
- // For debug
-// private void showSearchersOpen() {
-// int count = 0;
-// for (IndexSearcher searcher : searchers) {
-// if(searcher.getIndexReader().getRefCount() > 0)
-// ++count;
-// }
-// System.out.println(count);
-// }
-
-
- private class SpellCheckWorker implements Runnable {
- private final IndexReader reader;
- volatile boolean terminated = false;
- volatile boolean failed = false;
-
- SpellCheckWorker(IndexReader reader) {
- super();
- this.reader = reader;
- }
-
- public void run() {
- try {
- while (true) {
- try {
- checkCommonSuggestions(reader);
- } catch (AlreadyClosedException e) {
-
- return;
- } catch (Throwable e) {
-
- e.printStackTrace();
- failed = true;
- return;
- }
- }
- } finally {
- terminated = true;
- }
- }
-
- }
-
- class SpellCheckerMock extends SpellChecker {
- public SpellCheckerMock(Directory spellIndex) throws IOException {
- super(spellIndex);
- }
-
- public SpellCheckerMock(Directory spellIndex, StringDistance sd)
- throws IOException {
- super(spellIndex, sd);
- }
-
- public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException {
- super(spellIndex, sd, comparator);
- }
-
- @Override
- IndexSearcher createSearcher(Directory dir) throws IOException {
- IndexSearcher searcher = super.createSearcher(dir);
- TestSpellChecker.this.searchers.add(searcher);
- return searcher;
- }
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/build.xml lucene-trunk/modules/build.xml
--- lucene-clean-trunk/modules/build.xml 2011-05-22 12:38:11.000000000 -0400
+++ lucene-trunk/modules/build.xml 2011-05-22 19:07:14.000000000 -0400
@@ -25,6 +25,7 @@
+
@@ -35,6 +36,7 @@
+
@@ -45,6 +47,7 @@
+
@@ -55,6 +58,7 @@
+
@@ -66,6 +70,7 @@
+
@@ -96,6 +101,7 @@
+
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/build.xml lucene-trunk/modules/suggest/build.xml
--- lucene-clean-trunk/modules/suggest/build.xml 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/build.xml 2011-05-22 18:58:21.000000000 -0400
@@ -0,0 +1,47 @@
+
+
+
+
+
+
+
+ Suggest
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,35 @@
+package org.apache.lucene.search.spell;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+
+/**
+ * A simple interface representing a Dictionary. A Dictionary
+ * here is just a list of words.
+ *
+ *
+ * @version 1.0
+ */
+public interface Dictionary {
+
+ /**
+ * Return all words present in the dictionary
+ * @return Iterator
+ */
+ Iterator getWordsIterator();
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,487 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.PriorityQueue;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.FuzzyTermsEnum;
+import org.apache.lucene.search.BoostAttribute;
+import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+
+/**
+ * Simple automaton-based spellchecker.
+ *
+ * Candidates are presented directly from the term dictionary, based on
+ * Levenshtein distance. This is an alternative to {@link SpellChecker}
+ * if you are using an edit-distance-like metric such as Levenshtein
+ * or {@link JaroWinklerDistance}.
+ *
+ * A practical benefit of this spellchecker is that it requires no additional
+ * datastructures (neither in RAM nor on disk) to do its work.
+ *
+ * @see LevenshteinAutomata
+ * @see FuzzyTermsEnum
+ *
+ * @lucene.experimental
+ */
+public class DirectSpellChecker {
+ /** The default StringDistance, Levenshtein distance implemented internally
+ * via {@link LevenshteinAutomata}.
+ *
+ * Note: this is the fastest distance metric, because Levenshtein is used
+ * to draw candidates from the term dictionary: this just re-uses the scoring.
+ *
+ * Note also that this metric differs in subtle ways from {@link LevensteinDistance}:
+ *
+ *
This metric treats full unicode codepoints as characters, but
+ * LevenshteinDistance calculates based on UTF-16 code units.
+ *
This metric scales raw edit distances into a floating point score
+ * differently than LevenshteinDistance: the scaling is based upon the
+ * shortest of the two terms instead of the longest.
+ *
+ */
+ public static final StringDistance INTERNAL_LEVENSHTEIN = new StringDistance() {
+ public float getDistance(String s1, String s2) {
+ throw new UnsupportedOperationException("Not for external use.");
+ }};
+
+ /** maximum edit distance for candidate terms */
+ private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+ /** minimum prefix for candidate terms */
+ private int minPrefix = 1;
+ /** maximum number of top-N inspections per suggestion */
+ private int maxInspections = 5;
+ /** minimum accuracy for a term to match */
+ private float accuracy = SpellChecker.DEFAULT_ACCURACY;
+ /** value in [0..1] (or absolute number >=1) representing the minimum
+ * number of documents (of the total) where a term should appear. */
+ private float thresholdFrequency = 0f;
+ /** minimum length of a query word to return suggestions */
+ private int minQueryLength = 4;
+ /** value in [0..1] (or absolute number >=1) representing the maximum
+ * number of documents (of the total) a query term can appear in to
+ * be corrected. */
+ private float maxQueryFrequency = 0.01f;
+ /** true if the spellchecker should lowercase terms */
+ private boolean lowerCaseTerms = true;
+ /** the comparator to use */
+ private Comparator comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
+ /** the string distance to use */
+ private StringDistance distance = INTERNAL_LEVENSHTEIN;
+
+ /** Get the maximum number of Levenshtein edit-distances to draw
+ * candidate terms from. */
+ public int getMaxEdits() {
+ return maxEdits;
+ }
+
+ /** Sets the maximum number of Levenshtein edit-distances to draw
+ * candidate terms from. This value can be 1 or 2. The default is 2.
+ *
+ * Note: a large number of spelling errors occur with an edit distance
+ * of 1, by setting this value to 1 you can increase both performance
+ * and precision at the cost of recall.
+ */
+ public void setMaxEdits(int maxEdits) {
+ if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
+ throw new UnsupportedOperationException("Invalid maxEdits");
+ this.maxEdits = maxEdits;
+ }
+
+ /**
+ * Get the minimal number of characters that must match exactly
+ */
+ public int getMinPrefix() {
+ return minPrefix;
+ }
+
+ /**
+ * Sets the minimal number of initial characters (default: 1)
+ * that must match exactly.
+ *
+ * This can improve both performance and accuracy of results,
+ * as misspellings are commonly not the first character.
+ */
+ public void setMinPrefix(int minPrefix) {
+ this.minPrefix = minPrefix;
+ }
+
+ /**
+ * Get the maximum number of top-N inspections per suggestion
+ */
+ public int getMaxInspections() {
+ return maxInspections;
+ }
+
+ /**
+ * Set the maximum number of top-N inspections (default: 5) per suggestion.
+ *
+ * Increasing this number can improve the accuracy of results, at the cost
+ * of performance.
+ */
+ public void setMaxInspections(int maxInspections) {
+ this.maxInspections = maxInspections;
+ }
+
+ /**
+ * Get the minimal accuracy from the StringDistance for a match
+ */
+ public float getAccuracy() {
+ return accuracy;
+ }
+
+ /**
+ * Set the minimal accuracy required (default: 0.5f) from a StringDistance
+ * for a suggestion match.
+ */
+ public void setAccuracy(float accuracy) {
+ this.accuracy = accuracy;
+ }
+
+ /**
+ * Get the minimal threshold of documents a term must appear for a match
+ */
+ public float getThresholdFrequency() {
+ return thresholdFrequency;
+ }
+
+ /**
+ * Set the minimal threshold of documents a term must appear for a match.
+ *
+ * This can improve quality by only suggesting high-frequency terms. Note that
+ * very high values might decrease performance slightly, by forcing the spellchecker
+ * to draw more candidates from the term dictionary, but a practical value such
+ * as 1 can be very useful towards improving quality.
+ *
+ * This can be specified as a relative percentage of documents such as 0.5f,
+ * or it can be specified as an absolute whole document frequency, such as 4f.
+ * Absolute document frequencies may not be fractional.
+ */
+ public void setThresholdFrequency(float thresholdFrequency) {
+ if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency)
+ throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
+ this.thresholdFrequency = thresholdFrequency;
+ }
+
+ /** Get the minimum length of a query term needed to return suggestions */
+ public int getMinQueryLength() {
+ return minQueryLength;
+ }
+
+ /**
+ * Set the minimum length of a query term (default: 4) needed to return suggestions.
+ *
+ * Very short query terms will often cause only bad suggestions with any distance
+ * metric.
+ */
+ public void setMinQueryLength(int minQueryLength) {
+ this.minQueryLength = minQueryLength;
+ }
+
+ /**
+ * Get the maximum threshold of documents a query term can appear in order
+ * to provide suggestions.
+ */
+ public float getMaxQueryFrequency() {
+ return maxQueryFrequency;
+ }
+
+ /**
+ * Set the maximum threshold (default: 0.01f) of documents a query term can
+ * appear in order to provide suggestions.
+ *
+ * Very high-frequency terms are typically spelled correctly. Additionally,
+ * this can increase performance as it will do no work for the common case
+ * of correctly-spelled input terms.
+ *
+ * This can be specified as a relative percentage of documents such as 0.5f,
+ * or it can be specified as an absolute whole document frequency, such as 4f.
+ * Absolute document frequencies may not be fractional.
+ */
+ public void setMaxQueryFrequency(float maxQueryFrequency) {
+ if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency)
+ throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
+ this.maxQueryFrequency = maxQueryFrequency;
+ }
+
+ /** true if the spellchecker should lowercase terms */
+ public boolean getLowerCaseTerms() {
+ return lowerCaseTerms;
+ }
+
+ /**
+ * True if the spellchecker should lowercase terms (default: true)
+ *
+ * This is a convenience method, if your index field has more complicated
+ * analysis (such as StandardTokenizer removing punctuation), its probably
+ * better to turn this off, and instead run your query terms through your
+ * Analyzer first.
+ *
+ * If this option is not on, case differences count as an edit!
+ */
+ public void setLowerCaseTerms(boolean lowerCaseTerms) {
+ this.lowerCaseTerms = lowerCaseTerms;
+ }
+
+ /**
+ * Get the current comparator in use.
+ */
+ public Comparator getComparator() {
+ return comparator;
+ }
+
+ /**
+ * Set the comparator for sorting suggestions.
+ * The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR}
+ */
+ public void setComparator(Comparator comparator) {
+ this.comparator = comparator;
+ }
+
+ /**
+ * Get the string distance metric in use.
+ */
+ public StringDistance getDistance() {
+ return distance;
+ }
+
+ /**
+ * Set the string distance metric.
+ * The default is {@link #INTERNAL_LEVENSHTEIN}
+ *
+ * Note: because this spellchecker draws its candidates from the
+ * term dictionary using Levenshtein, it works best with an edit-distance-like
+ * string metric. If you use a different metric than the default,
+ * you might want to consider increasing {@link #setMaxInspections(int)}
+ * to draw more candidates for your metric to rank.
+ */
+ public void setDistance(StringDistance distance) {
+ this.distance = distance;
+ }
+
+ /**
+ * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean)
+ * suggestSimilar(term, numSug, ir, false)}
+ */
+ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir)
+ throws IOException {
+ return suggestSimilar(term, numSug, ir, false);
+ }
+
+ /**
+ * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean, float)
+ * suggestSimilar(term, numSug, ir, morePopular, this.accuracy)}
+ */
+ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
+ boolean morePopular) throws IOException {
+ return suggestSimilar(term, numSug, ir, morePopular, accuracy);
+ }
+
+ /**
+ * Suggest similar words.
+ *
+ *
Unlike {@link SpellChecker}, the similarity used to fetch the most
+ * relevant terms is an edit distance, therefore typically a low value
+ * for numSug will work very well.
+ *
+ * @param term Term you want to spell check on
+ * @param numSug the maximum number of suggested words
+ * @param ir IndexReader to find terms from
+ * @param morePopular return only suggested words that are as frequent or more frequent than the searched word
+ * @param accuracy return only suggested words that match with this similarity
+ * @return sorted list of the suggested words according to the comparator
+ * @throws IOException
+ */
+ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
+ boolean morePopular, float accuracy) throws IOException {
+
+ String text = term.text();
+ if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
+ return new SuggestWord[0];
+
+ if (lowerCaseTerms)
+ term = term.createTerm(text.toLowerCase(Locale.ENGLISH));
+
+ int docfreq = ir.docFreq(term);
+
+ // see line 341 of spellchecker. this is certainly very very nice for perf,
+ // but is it really the right way to go?
+ if (!morePopular && docfreq > 0) {
+ return new SuggestWord[0];
+ }
+
+ int maxDoc = ir.maxDoc();
+
+ if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
+ return new SuggestWord[0];
+ } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) {
+ return new SuggestWord[0];
+ }
+
+ if (!morePopular) docfreq = 0;
+
+ if (thresholdFrequency >= 1f) {
+ docfreq = Math.max(docfreq, (int) thresholdFrequency);
+ } else if (thresholdFrequency > 0f) {
+ docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1);
+ }
+
+ Collection terms = null;
+ int inspections = numSug * maxInspections;
+
+ // try ed=1 first, in case we get lucky
+ terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy);
+ if (maxEdits > 1 && terms.size() < inspections) {
+ HashSet<ScoreTerm> moreTerms = new HashSet<ScoreTerm>();
+ moreTerms.addAll(terms);
+ moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy));
+ terms = moreTerms;
+ }
+
+ // create the suggestword response, sort it, and trim it to size.
+
+ SuggestWord suggestions[] = new SuggestWord[terms.size()];
+ int index = suggestions.length - 1;
+ for (ScoreTerm s : terms) {
+ SuggestWord suggestion = new SuggestWord();
+ suggestion.string = s.termAsString != null ? s.termAsString : s.term.utf8ToString();
+ suggestion.score = s.score;
+ suggestion.freq = s.docfreq;
+ suggestions[index--] = suggestion;
+ }
+
+ ArrayUtil.mergeSort(suggestions, Collections.reverseOrder(comparator));
+ if (numSug < suggestions.length) {
+ SuggestWord trimmed[] = new SuggestWord[numSug];
+ System.arraycopy(suggestions, 0, trimmed, 0, numSug);
+ suggestions = trimmed;
+ }
+ return suggestions;
+ }
+
+ private Collection<ScoreTerm> suggestSimilar(Term term, int numSug,
+ IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException {
+
+ AttributeSource atts = new AttributeSource();
+ MaxNonCompetitiveBoostAttribute maxBoostAtt =
+ atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
+ FuzzyTermsEnum e = new FuzzyTermsEnum(MultiFields.getTerms(ir, term.field()).iterator(), atts, term, editDistance, Math.max(minPrefix, editDistance-1));
+ final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
+
+ BytesRef queryTerm = new BytesRef(term.text());
+ BytesRef candidateTerm;
+ ScoreTerm st = new ScoreTerm();
+ BoostAttribute boostAtt =
+ e.attributes().addAttribute(BoostAttribute.class);
+ while ((candidateTerm = e.next()) != null) {
+ final float boost = boostAtt.getBoost();
+ // ignore uncompetitive hits
+ if (stQueue.size() >= numSug && boost <= stQueue.peek().boost)
+ continue;
+
+ // ignore exact match of the same term
+ if (queryTerm.bytesEquals(candidateTerm))
+ continue;
+
+ int df = e.docFreq();
+
+ // check docFreq if required
+ if (df <= docfreq)
+ continue;
+
+ final float score;
+ final String termAsString;
+ if (distance == INTERNAL_LEVENSHTEIN) {
+ // delay creating strings until the end
+ termAsString = null;
+ // undo FuzzyTermsEnum's scale factor for a real scaled lev score
+ score = boost / e.getScaleFactor() + e.getMinSimilarity();
+ } else {
+ termAsString = candidateTerm.utf8ToString();
+ score = distance.getDistance(term.text(), termAsString);
+ }
+
+ if (score < accuracy)
+ continue;
+
+ // add new entry in PQ
+ st.term = new BytesRef(candidateTerm);
+ st.boost = boost;
+ st.docfreq = df;
+ st.termAsString = termAsString;
+ st.score = score;
+ stQueue.offer(st);
+ // possibly drop entries from queue
+ st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
+ maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
+ }
+
+ return stQueue;
+ }
+
+ private static class ScoreTerm implements Comparable<ScoreTerm> {
+ public BytesRef term;
+ public float boost;
+ public int docfreq;
+
+ public String termAsString;
+ public float score;
+
+ public int compareTo(ScoreTerm other) {
+ if (term.bytesEquals(other.term))
+ return 0; // consistent with equals
+ if (this.boost == other.boost)
+ return other.term.compareTo(this.term);
+ else
+ return Float.compare(this.boost, other.boost);
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((term == null) ? 0 : term.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (obj == null) return false;
+ if (getClass() != obj.getClass()) return false;
+ ScoreTerm other = (ScoreTerm) obj;
+ if (term == null) {
+ if (other.term != null) return false;
+ } else if (!term.bytesEquals(other.term)) return false;
+ return true;
+ }
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java 2011-05-22 19:00:10.000000000 -0400
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.spell;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * HighFrequencyDictionary: terms taken from the given field
+ * of a Lucene index, which appear in a number of documents
+ * above a given threshold.
+ *
+ * Threshold is a value in [0..1] representing the minimum
+ * number of documents (of the total) where a term should appear.
+ *
+ * Based on LuceneDictionary.
+ */
+public class HighFrequencyDictionary implements Dictionary {
+ private IndexReader reader;
+ private String field;
+ private float thresh;
+
+ public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
+ this.reader = reader;
+ this.field = StringHelper.intern(field);
+ this.thresh = thresh;
+ }
+
+ public final Iterator<String> getWordsIterator() {
+ return new HighFrequencyIterator();
+ }
+
+ final class HighFrequencyIterator implements TermFreqIterator, SortedIterator {
+ private TermsEnum termsEnum;
+ private BytesRef actualTerm;
+ private boolean hasNextCalled;
+ private int minNumDocs;
+
+ HighFrequencyIterator() {
+ try {
+ Terms terms = MultiFields.getTerms(reader, field);
+ if (terms != null) {
+ termsEnum = terms.iterator();
+ }
+ minNumDocs = (int)(thresh * (float)reader.numDocs());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private boolean isFrequent(int freq) {
+ return freq >= minNumDocs;
+ }
+
+ public float freq() {
+ try {
+ return termsEnum.docFreq();
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ }
+
+ public String next() {
+ if (!hasNextCalled && !hasNext()) {
+ return null;
+ }
+ hasNextCalled = false;
+
+ return (actualTerm != null) ? actualTerm.utf8ToString() : null;
+ }
+
+ public boolean hasNext() {
+ if (hasNextCalled) {
+ return actualTerm != null;
+ }
+ hasNextCalled = true;
+
+ if (termsEnum == null) {
+ return false;
+ }
+
+ while(true) {
+
+ try {
+ actualTerm = termsEnum.next();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ // if there are no words return false
+ if (actualTerm == null) {
+ return false;
+ }
+
+ // got a valid term, does it pass the threshold?
+ try {
+ if (isFrequent(termsEnum.docFreq())) {
+ return true;
+ }
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ }
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,112 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+public class JaroWinklerDistance implements StringDistance {
+
+ private float threshold = 0.7f;
+
+ private int[] matches(String s1, String s2) {
+ String max, min;
+ if (s1.length() > s2.length()) {
+ max = s1;
+ min = s2;
+ } else {
+ max = s2;
+ min = s1;
+ }
+ int range = Math.max(max.length() / 2 - 1, 0);
+ int[] matchIndexes = new int[min.length()];
+ Arrays.fill(matchIndexes, -1);
+ boolean[] matchFlags = new boolean[max.length()];
+ int matches = 0;
+ for (int mi = 0; mi < min.length(); mi++) {
+ char c1 = min.charAt(mi);
+ for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max
+ .length()); xi < xn; xi++) {
+ if (!matchFlags[xi] && c1 == max.charAt(xi)) {
+ matchIndexes[mi] = xi;
+ matchFlags[xi] = true;
+ matches++;
+ break;
+ }
+ }
+ }
+ char[] ms1 = new char[matches];
+ char[] ms2 = new char[matches];
+ for (int i = 0, si = 0; i < min.length(); i++) {
+ if (matchIndexes[i] != -1) {
+ ms1[si] = min.charAt(i);
+ si++;
+ }
+ }
+ for (int i = 0, si = 0; i < max.length(); i++) {
+ if (matchFlags[i]) {
+ ms2[si] = max.charAt(i);
+ si++;
+ }
+ }
+ int transpositions = 0;
+ for (int mi = 0; mi < ms1.length; mi++) {
+ if (ms1[mi] != ms2[mi]) {
+ transpositions++;
+ }
+ }
+ int prefix = 0;
+ for (int mi = 0; mi < min.length(); mi++) {
+ if (s1.charAt(mi) == s2.charAt(mi)) {
+ prefix++;
+ } else {
+ break;
+ }
+ }
+ return new int[] { matches, transpositions / 2, prefix, max.length() };
+ }
+
+ public float getDistance(String s1, String s2) {
+ int[] mtp = matches(s1, s2);
+ float m = mtp[0];
+ if (m == 0) {
+ return 0f;
+ }
+ float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
+ float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
+ * (1 - j);
+ return jw;
+ }
+
+ /**
+ * Sets the threshold used to determine when Winkler bonus should be used.
+ * Set to a negative value to get the Jaro distance.
+ * @param threshold the new value of the threshold
+ */
+ public void setThreshold(float threshold) {
+ this.threshold = threshold;
+ }
+
+ /**
+ * Returns the current value of the threshold used for adding the Winkler bonus.
+ * The default value is 0.7.
+ * @return the current value of the threshold
+ */
+ public float getThreshold() {
+ return threshold;
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,109 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Levenstein edit distance class.
+ */
+public final class LevensteinDistance implements StringDistance {
+
+ /**
+ * Optimized to run a bit faster than the static getDistance().
+ * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
+ */
+ public LevensteinDistance () {
+ }
+
+
+ //*****************************
+ // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
+ //*****************************
+ public float getDistance (String target, String other) {
+ char[] sa;
+ int n;
+ int p[]; //'previous' cost array, horizontally
+ int d[]; // cost array, horizontally
+ int _d[]; //placeholder to assist in swapping p and d
+
+ /*
+ The difference between this impl. and the previous is that, rather
+ than creating and retaining a matrix of size s.length()+1 by t.length()+1,
+ we maintain two single-dimensional arrays of length s.length()+1. The first, d,
+ is the 'current working' distance array that maintains the newest distance cost
+ counts as we iterate through the characters of String s. Each time we increment
+ the index of String t we are comparing, d is copied to p, the second int[]. Doing so
+ allows us to retain the previous cost counts as required by the algorithm (taking
+ the minimum of the cost count to the left, up one, and diagonally up and to the left
+ of the current cost count being calculated). (Note that the arrays aren't really
+ copied anymore, just switched...this is clearly much better than cloning an array
+ or doing a System.arraycopy() each time through the outer loop.)
+
+ Effectively, the difference between the two implementations is this one does not
+ cause an out of memory condition when calculating the LD over two very large strings.
+ */
+
+ sa = target.toCharArray();
+ n = sa.length;
+ p = new int[n+1];
+ d = new int[n+1];
+
+ final int m = other.length();
+ if (n == 0 || m == 0) {
+ if (n == m) {
+ return 1;
+ }
+ else {
+ return 0;
+ }
+ }
+
+
+ // indexes into strings s and t
+ int i; // iterates through s
+ int j; // iterates through t
+
+ char t_j; // jth character of t
+
+ int cost; // cost
+
+ for (i = 0; i<=n; i++) {
+ p[i] = i;
+ }
+
+ for (j = 1; j<=m; j++) {
+ t_j = other.charAt(j-1);
+ d[0] = j;
+
+ for (i=1; i<=n; i++) {
+ cost = sa[i-1]==t_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
+ }
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length));
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,96 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+
+import java.util.Iterator;
+
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.util.StringHelper;
+
+import java.io.*;
+
+/**
+ * Lucene Dictionary: terms taken from the given field
+ * of a Lucene index.
+ *
+ * When using IndexReader.terms(Term) the code must not call next() on TermEnum
+ * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
+ *
+ *
+ *
+ */
+public class LuceneDictionary implements Dictionary {
+ private IndexReader reader;
+ private String field;
+
+ public LuceneDictionary(IndexReader reader, String field) {
+ this.reader = reader;
+ this.field = StringHelper.intern(field);
+ }
+
+ public final Iterator<String> getWordsIterator() {
+ return new LuceneIterator();
+ }
+
+
+ final class LuceneIterator implements Iterator<String> {
+ private TermsEnum termsEnum;
+ private BytesRef pendingTerm;
+
+ LuceneIterator() {
+ try {
+ final Terms terms = MultiFields.getTerms(reader, field);
+ if (terms != null) {
+ termsEnum = terms.iterator();
+ pendingTerm = termsEnum.next();
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public String next() {
+ if (pendingTerm == null) {
+ return null;
+ }
+
+ String result = pendingTerm.utf8ToString();
+
+ try {
+ pendingTerm = termsEnum.next();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ return result;
+ }
+
+ public boolean hasNext() {
+ return pendingTerm != null;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/NGramDistance.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/NGramDistance.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/NGramDistance.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/NGramDistance.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,144 @@
+package org.apache.lucene.search.spell;
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/**
+ * N-Gram version of edit distance based on paper by Grzegorz Kondrak,
+ * "N-gram similarity and distance". Proceedings of the Twelfth International
+ * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126,
+ * Buenos Aires, Argentina, November 2005.
+ * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
+ *
+ * This implementation uses the position-based optimization to compute partial
+ * matches of n-gram sub-strings and adds a null-character prefix of size n-1
+ * so that the first character is contained in the same number of n-grams as
+ * a middle character. Null-character prefix matches are discounted so that
+ * strings with no matching characters will return a distance of 0.
+ *
+ */
+public class NGramDistance implements StringDistance {
+
+ private int n;
+
+ /**
+ * Creates an N-Gram distance measure using n-grams of the specified size.
+ * @param size The size of the n-gram to be used to compute the string distance.
+ */
+ public NGramDistance(int size) {
+ this.n = size;
+ }
+
+ /**
+ * Creates an N-Gram distance measure using n-grams of size 2.
+ */
+ public NGramDistance() {
+ this(2);
+ }
+
+ public float getDistance(String source, String target) {
+ final int sl = source.length();
+ final int tl = target.length();
+
+ if (sl == 0 || tl == 0) {
+ if (sl == tl) {
+ return 1;
+ }
+ else {
+ return 0;
+ }
+ }
+
+ int cost = 0;
+ if (sl < n || tl < n) {
+ for (int i=0,ni=Math.min(sl,tl);i<ni;i++) {
+ if (source.charAt(i) == target.charAt(i)) {
+ cost++;
+ }
+ }
+ return (float) cost/Math.max(sl, tl);
+ }
+
+ char[] sa = new char[sl+n-1];
+ float p[]; //'previous' cost array, horizontally
+ float d[]; // cost array, horizontally
+ float _d[]; //placeholder to assist in swapping p and d
+
+ //construct sa with prefix
+ for (int i=0;i<sa.length;i++) {
+ if (i < n-1) {
+ sa[i]=0; //add prefix
+ }
+ else {
+ sa[i] = source.charAt(i-n+1);
+ }
+ }
+ p = new float[sl+1];
+ d = new float[sl+1];
+
+ // indexes into strings s and t
+ int i; // iterates through source
+ int j; // iterates through target
+
+ char[] t_j = new char[n]; // jth n-gram of t
+
+ for (i = 0; i<=sl; i++) {
+ p[i] = i;
+ }
+
+ for (j = 1; j<=tl; j++) {
+ //construct t_j n-gram
+ if (j < n) {
+ for (int ti=0;ti<n-j;ti++) {
+ t_j[ti]=0; //add prefix
+ }
+ for (int ti=n-j;ti<n;ti++) {
+ t_j[ti]=target.charAt(ti-(n-j));
+ }
+ }
+ else {
+ t_j = target.substring(j-n, j).toCharArray();
+ }
+ d[0] = j;
+ for (i=1; i<=sl; i++) {
+ cost = 0;
+ int tn=n;
+ //compare sa to t_j
+ for (int ni=0;ni<n;ni++) {
+ if (sa[i-1+ni] != t_j[ni]) {
+ cost++;
+ }
+ else if (sa[i-1+ni] == 0) { //discount matches on prefix
+ tn--;
+ }
+ }
+ float ec = (float) cost/tn;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+ec);
+ }
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return 1.0f - (p[sl] / Math.max(tl, sl));
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,89 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.*;
+import java.util.Iterator;
+
+/**
+ * Dictionary represented by a text file.
+ *
+ * <p/>Format allowed: 1 word per line:
+ * <br>word1
+ * <br>word2
+ * <br>word3
+ */
+public class PlainTextDictionary implements Dictionary {
+
+ private BufferedReader in;
+ private String line;
+ private boolean hasNextCalled;
+
+ public PlainTextDictionary(File file) throws FileNotFoundException {
+ in = new BufferedReader(new FileReader(file));
+ }
+
+ public PlainTextDictionary(InputStream dictFile) {
+ in = new BufferedReader(new InputStreamReader(dictFile));
+ }
+
+ /**
+ * Creates a dictionary based on a reader.
+ */
+ public PlainTextDictionary(Reader reader) {
+ in = new BufferedReader(reader);
+ }
+
+ public Iterator<String> getWordsIterator() {
+ return new fileIterator();
+ }
+
+ final class fileIterator implements Iterator<String> {
+ public String next() {
+ if (!hasNextCalled) {
+ hasNext();
+ }
+ hasNextCalled = false;
+ return line;
+ }
+
+ public boolean hasNext() {
+ hasNextCalled = true;
+ try {
+ line = in.readLine();
+ } catch (IOException ex) {
+ throw new RuntimeException(ex);
+ }
+ return (line != null) ? true : false;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java 2011-05-22 16:52:15.000000000 -0400
@@ -0,0 +1,11 @@
+package org.apache.lucene.search.spell;
+
+import java.util.Iterator;
+
+/**
+ * Marker interface to signal that elements coming from {@link Iterator}
+ * come in ascending lexicographic order.
+ */
+public interface SortedIterator {
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,724 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.TieredMergePolicy;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.ReaderUtil;
+import org.apache.lucene.util.Version;
+
+/**
+ *
+ * Spell Checker class (Main class)
+ * (initially inspired by the David Spencer code).
+ *
+ *
+ *
+ * <p>Example Usage:
+ *
+ *
+ * SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
+ * // To index a field of a user index:
+ * spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
+ * // To index a file containing words:
+ * spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
+ * String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
+ *
+ *
+ *
+ * @version 1.0
+ */
+public class SpellChecker implements java.io.Closeable {
+
+ /**
+ * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} .
+ */
+ public static final float DEFAULT_ACCURACY = 0.5f;
+
+ /**
+ * Field name for each word in the ngram index.
+ */
+ public static final String F_WORD = "word";
+
+ private static final Term F_WORD_TERM = new Term(F_WORD);
+
+ /**
+ * the spell index
+ */
+ // don't modify the directory directly - see #swapSearcher()
+ // TODO: why is this package private?
+ Directory spellIndex;
+ /**
+ * Boost value for start and end grams
+ */
+ private float bStart = 2.0f;
+
+ private float bEnd = 1.0f;
+ // don't use this searcher directly - see #swapSearcher()
+
+ private IndexSearcher searcher;
+ /*
+ * this locks all modifications to the current searcher.
+ */
+
+ private final Object searcherLock = new Object();
+ /*
+ * this lock synchronizes all possible modifications to the
+ * current index directory. It should not be possible to try modifying
+ * the same index concurrently. Note: Do not acquire the searcher lock
+ * before acquiring this lock!
+ */
+ private final Object modifyCurrentIndexLock = new Object();
+
+ private volatile boolean closed = false;
+ // minimum score for hits generated by the spell checker query
+
+ private float accuracy = DEFAULT_ACCURACY;
+
+ private StringDistance sd;
+ private Comparator<SuggestWord> comparator;
+
+ /**
+ * Use the given directory as a spell checker index. The directory
+ * is created if it doesn't exist yet.
+ * @param spellIndex the spell index directory
+ * @param sd the {@link StringDistance} measurement to use
+ * @throws IOException if Spellchecker can not open the directory
+ */
+ public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
+ this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
+ }
+ /**
+ * Use the given directory as a spell checker index with a
+ * {@link LevensteinDistance} as the default {@link StringDistance}. The
+ * directory is created if it doesn't exist yet.
+ *
+ * @param spellIndex
+ * the spell index directory
+ * @throws IOException
+ * if spellchecker can not open the directory
+ */
+ public SpellChecker(Directory spellIndex) throws IOException {
+ this(spellIndex, new LevensteinDistance());
+ }
+
+ /**
+ * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure
+ * and the given {@link java.util.Comparator} for sorting the results.
+ * @param spellIndex The spelling index
+ * @param sd The distance
+ * @param comparator The comparator
+ * @throws IOException if there is a problem opening the index
+ */
+ public SpellChecker(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
+ setSpellIndex(spellIndex);
+ setStringDistance(sd);
+ this.comparator = comparator;
+ }
+
+ /**
+ * Use a different index as the spell checker index or re-open
+ * the existing index if spellIndex is the same value
+ * as given in the constructor.
+ * @param spellIndexDir the spell directory to use
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ * @throws IOException if spellchecker can not open the directory
+ */
+ // TODO: we should make this final as it is called in the constructor
+ public void setSpellIndex(Directory spellIndexDir) throws IOException {
+ // this could be the same directory as the current spellIndex
+ // modifications to the directory should be synchronized
+ synchronized (modifyCurrentIndexLock) {
+ ensureOpen();
+ if (!IndexReader.indexExists(spellIndexDir)) {
+ IndexWriter writer = new IndexWriter(spellIndexDir,
+ new IndexWriterConfig(Version.LUCENE_CURRENT,
+ new WhitespaceAnalyzer(Version.LUCENE_CURRENT)));
+ writer.close();
+ }
+ swapSearcher(spellIndexDir);
+ }
+ }
+
+ /**
+ * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}.
+ * @param comparator the comparator
+ */
+ public void setComparator(Comparator<SuggestWord> comparator) {
+ this.comparator = comparator;
+ }
+
+ public Comparator<SuggestWord> getComparator() {
+ return comparator;
+ }
+
+ /**
+ * Sets the {@link StringDistance} implementation for this
+ * {@link SpellChecker} instance.
+ *
+ * @param sd the {@link StringDistance} implementation for this
+ * {@link SpellChecker} instance
+ */
+ public void setStringDistance(StringDistance sd) {
+ this.sd = sd;
+ }
+ /**
+ * Returns the {@link StringDistance} instance used by this
+ * {@link SpellChecker} instance.
+ *
+ * @return the {@link StringDistance} instance used by this
+ * {@link SpellChecker} instance.
+ */
+ public StringDistance getStringDistance() {
+ return sd;
+ }
+
+ /**
+ * Sets the accuracy 0 < minScore < 1; default {@link #DEFAULT_ACCURACY}
+ * @param acc The new accuracy
+ */
+ public void setAccuracy(float acc) {
+ this.accuracy = acc;
+ }
+
+ /**
+ * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)}, to
+ * decide whether a suggestion is included or not.
+ * @return The current accuracy setting
+ */
+ public float getAccuracy() {
+ return accuracy;
+ }
+
+ /**
+ * Suggest similar words.
+ *
+ *
+ * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
+ * is not the same as the edit distance strategy used to calculate the best
+ * matching spell-checked word from the hits that Lucene found, one usually has
+ * to retrieve a couple of numSug's in order to get the true best match.
+ *
+ *
+ * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
+ * Thus, you should set this value to at least 5 for a good suggestion.
+ *
+ * @param word the word you want a spell check done on
+ * @param numSug the number of suggested words
+ * @throws IOException if the underlying index throws an {@link IOException}
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ * @return String[]
+ *
+ * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
+ */
+ public String[] suggestSimilar(String word, int numSug) throws IOException {
+ return this.suggestSimilar(word, numSug, null, null, false);
+ }
+
+ /**
+ * Suggest similar words.
+ *
+ * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
+ * is not the same as the edit distance strategy used to calculate the best
+ * matching spell-checked word from the hits that Lucene found, one usually has
+ * to retrieve a couple of numSug's in order to get the true best match.
+ *
+ * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
+ * Thus, you should set this value to at least 5 for a good suggestion.
+ *
+ * @param word the word you want a spell check done on
+ * @param numSug the number of suggested words
+ * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
+ * @throws IOException if the underlying index throws an {@link IOException}
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ * @return String[]
+ *
+ * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
+ */
+ public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
+ return this.suggestSimilar(word, numSug, null, null, false, accuracy);
+ }
+
+ /**
+ * Suggest similar words (optionally restricted to a field of an index).
+ *
+ * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
+ * is not the same as the edit distance strategy used to calculate the best
+ * matching spell-checked word from the hits that Lucene found, one usually has
+ * to retrieve a couple of numSug's in order to get the true best match.
+ *
+ * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
+ * Thus, you should set this value to at least 5 for a good suggestion.
+ *
+ * <p>Uses the {@link #getAccuracy()} value passed into the constructor as the accuracy.
+ *
+ * @param word the word you want a spell check done on
+ * @param numSug the number of suggested words
+ * @param ir the indexReader of the user index (can be null see field param)
+ * @param field the field of the user index: if field is not null, the suggested
+ * words are restricted to the words present in this field.
+ * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
+ * (only if restricted mode = (indexReader!=null and field!=null)
+ * @throws IOException if the underlying index throws an {@link IOException}
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ * @return String[] the sorted list of the suggest words with these 2 criteria:
+ * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
+ * of the suggest words in the field of the user index
+ *
+ * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
+ */
+ public String[] suggestSimilar(String word, int numSug, IndexReader ir,
+ String field, boolean morePopular) throws IOException {
+ return suggestSimilar(word, numSug, ir, field, morePopular, accuracy);
+ }
+
+
+ /**
+ * Suggest similar words (optionally restricted to a field of an index).
+ *
+ * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
+ * is not the same as the edit distance strategy used to calculate the best
+ * matching spell-checked word from the hits that Lucene found, one usually has
+ * to retrieve a couple of numSug's in order to get the true best match.
+ *
+ * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
+ * Thus, you should set this value to at least 5 for a good suggestion.
+ *
+ * @param word the word you want a spell check done on
+ * @param numSug the number of suggested words
+ * @param ir the indexReader of the user index (can be null see field param)
+ * @param field the field of the user index: if field is not null, the suggested
+ * words are restricted to the words present in this field.
+ * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
+ * (only if restricted mode = (indexReader!=null and field!=null)
+ * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
+ * @throws IOException if the underlying index throws an {@link IOException}
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ * @return String[] the sorted list of the suggest words with these 2 criteria:
+ * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
+ * of the suggest words in the field of the user index
+ */
+ public String[] suggestSimilar(String word, int numSug, IndexReader ir,
+ String field, boolean morePopular, float accuracy) throws IOException {
+ // obtainSearcher calls ensureOpen
+ final IndexSearcher indexSearcher = obtainSearcher();
+ try{
+
+ final int lengthWord = word.length();
+
+ final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
+ final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
+ // if the word exists in the real index and we don't care for word frequency, return the word itself
+ if (!morePopular && freq > 0) {
+ return new String[] { word };
+ }
+
+ BooleanQuery query = new BooleanQuery();
+ String[] grams;
+ String key;
+
+ for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
+
+ key = "gram" + ng; // form key
+
+ grams = formGrams(word, ng); // form word into ngrams (allow dups too)
+
+ if (grams.length == 0) {
+ continue; // hmm
+ }
+
+ if (bStart > 0) { // should we boost prefixes?
+ add(query, "start" + ng, grams[0], bStart); // matches start of word
+
+ }
+ if (bEnd > 0) { // should we boost suffixes
+ add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
+
+ }
+ for (int i = 0; i < grams.length; i++) {
+ add(query, key, grams[i]);
+ }
+ }
+
+ int maxHits = 10 * numSug;
+
+ // System.out.println("Q: " + query);
+ ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs;
+ // System.out.println("HITS: " + hits.length());
+ SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);
+
+ // go thru more than 'maxr' matches in case the distance filter triggers
+ int stop = Math.min(hits.length, maxHits);
+ SuggestWord sugWord = new SuggestWord();
+ for (int i = 0; i < stop; i++) {
+
+ sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word
+
+ // don't suggest a word for itself, that would be silly
+ if (sugWord.string.equals(word)) {
+ continue;
+ }
+
+ // edit distance
+ sugWord.score = sd.getDistance(word,sugWord.string);
+ if (sugWord.score < accuracy) {
+ continue;
+ }
+
+ if (ir != null && field != null) { // use the user index
+ sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
+ // don't suggest a word that is not present in the field
+ if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
+ continue;
+ }
+ }
+ sugQueue.insertWithOverflow(sugWord);
+ if (sugQueue.size() == numSug) {
+ // if queue full, maintain the minScore score
+ accuracy = sugQueue.top().score;
+ }
+ sugWord = new SuggestWord();
+ }
+
+ // convert to array string
+ String[] list = new String[sugQueue.size()];
+ for (int i = sugQueue.size() - 1; i >= 0; i--) {
+ list[i] = sugQueue.pop().string;
+ }
+
+ return list;
+ } finally {
+ releaseSearcher(indexSearcher);
+ }
+ }
+ /**
+ * Add a clause to a boolean query.
+ */
+ private static void add(BooleanQuery q, String name, String value, float boost) {
+ Query tq = new TermQuery(new Term(name, value));
+ tq.setBoost(boost);
+ q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
+ }
+
+ /**
+ * Add a clause to a boolean query.
+ */
+ private static void add(BooleanQuery q, String name, String value) {
+ q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
+ }
+
+ /**
+ * Form all ngrams for a given word.
+ * @param text the word to parse
+ * @param ng the ngram length e.g. 3
+ * @return an array of all ngrams in the word and note that duplicates are not removed
+ */
+ private static String[] formGrams(String text, int ng) {
+ int len = text.length();
+ String[] res = new String[len - ng + 1];
+ for (int i = 0; i < len - ng + 1; i++) {
+ res[i] = text.substring(i, i + ng);
+ }
+ return res;
+ }
+
+ /**
+ * Removes all terms from the spell check index.
+ * @throws IOException
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ */
+ public void clearIndex() throws IOException {
+ synchronized (modifyCurrentIndexLock) {
+ ensureOpen();
+ final Directory dir = this.spellIndex;
+ final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
+ Version.LUCENE_CURRENT,
+ new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
+ .setOpenMode(OpenMode.CREATE));
+ writer.close();
+ swapSearcher(dir);
+ }
+ }
+
+ /**
+ * Check whether the word exists in the index.
+ * @param word
+ * @throws IOException
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ * @return true if the word exists in the index
+ */
+ public boolean exist(String word) throws IOException {
+ // obtainSearcher calls ensureOpen
+ final IndexSearcher indexSearcher = obtainSearcher();
+ try{
+ return indexSearcher.docFreq(F_WORD_TERM.createTerm(word)) > 0;
+ } finally {
+ releaseSearcher(indexSearcher);
+ }
+ }
+
+ /**
+ * Indexes the data from the given {@link Dictionary}.
+ * @param dict Dictionary to index
+ * @param mergeFactor mergeFactor to use when indexing
+ * @param ramMB the max amount of memory in MB to use
+ * @param optimize whether or not the spellcheck index should be optimized
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ * @throws IOException
+ */
+ public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException {
+ synchronized (modifyCurrentIndexLock) {
+ ensureOpen();
+ final Directory dir = this.spellIndex;
+ final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB));
+ ((TieredMergePolicy) writer.getConfig().getMergePolicy()).setMaxMergeAtOnce(mergeFactor);
+ IndexSearcher indexSearcher = obtainSearcher();
+ final List termsEnums = new ArrayList();
+
+ if (searcher.maxDoc() > 0) {
+ new ReaderUtil.Gather(searcher.getIndexReader()) {
+ @Override
+ protected void add(int base, IndexReader r) throws IOException {
+ Terms terms = r.terms(F_WORD);
+ if (terms != null)
+ termsEnums.add(terms.iterator());
+ }
+ }.run();
+ }
+
+ boolean isEmpty = termsEnums.isEmpty();
+
+ try {
+ Iterator iter = dict.getWordsIterator();
+ BytesRef currentTerm = new BytesRef();
+
+ terms: while (iter.hasNext()) {
+ String word = iter.next();
+
+ int len = word.length();
+ if (len < 3) {
+ continue; // too short we bail but "too long" is fine...
+ }
+
+ if (!isEmpty) {
+ // we have a non-empty index, check if the term exists
+ currentTerm.copy(word);
+ for (TermsEnum te : termsEnums) {
+ if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) {
+ continue terms;
+ }
+ }
+ }
+
+ // ok index the word
+ Document doc = createDocument(word, getMin(len), getMax(len));
+ writer.addDocument(doc);
+ }
+ } finally {
+ releaseSearcher(indexSearcher);
+ }
+ // close writer
+ if (optimize)
+ writer.optimize();
+ writer.close();
+ // also re-open the spell index to see our own changes when the next suggestion
+ // is fetched:
+ swapSearcher(dir);
+ }
+ }
+
+ /**
+ * Indexes the data from the given {@link Dictionary}.
+ * @param dict the dictionary to index
+ * @param mergeFactor mergeFactor to use when indexing
+ * @param ramMB the max amount of memory in MB to use
+ * @throws IOException
+ */
+ public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
+ indexDictionary(dict, mergeFactor, ramMB, true);
+ }
+
+ /**
+ * Indexes the data from the given {@link Dictionary}.
+ * @param dict the dictionary to index
+ * @throws IOException
+ */
+ public final void indexDictionary(Dictionary dict) throws IOException {
+ indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
+ }
+
+ private static int getMin(int l) {
+ if (l > 5) {
+ return 3;
+ }
+ if (l == 5) {
+ return 2;
+ }
+ return 1;
+ }
+
+ private static int getMax(int l) {
+ if (l > 5) {
+ return 4;
+ }
+ if (l == 5) {
+ return 3;
+ }
+ return 2;
+ }
+
+ private static Document createDocument(String text, int ng1, int ng2) {
+ Document doc = new Document();
+ // the word field is never queried on... its indexed so it can be quickly
+ // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
+ Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
+ f.setOmitTermFreqAndPositions(true);
+ f.setOmitNorms(true);
+ doc.add(f); // orig term
+ addGram(text, doc, ng1, ng2);
+ return doc;
+ }
+
+ private static void addGram(String text, Document doc, int ng1, int ng2) {
+ int len = text.length();
+ for (int ng = ng1; ng <= ng2; ng++) {
+ String key = "gram" + ng;
+ String end = null;
+ for (int i = 0; i < len - ng + 1; i++) {
+ String gram = text.substring(i, i + ng);
+ doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
+ if (i == 0) {
+ // only one term possible in the startXXField, TF/pos and norms aren't needed.
+ Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
+ startField.setOmitTermFreqAndPositions(true);
+ startField.setOmitNorms(true);
+ doc.add(startField);
+ }
+ end = gram;
+ }
+ if (end != null) { // may not be present if len==ng1
+ // only one term possible in the endXXField, TF/pos and norms aren't needed.
+ Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED);
+ endField.setOmitTermFreqAndPositions(true);
+ endField.setOmitNorms(true);
+ doc.add(endField);
+ }
+ }
+ }
+
+ private IndexSearcher obtainSearcher() {
+ synchronized (searcherLock) {
+ ensureOpen();
+ searcher.getIndexReader().incRef();
+ return searcher;
+ }
+ }
+
+ private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{
+ // don't check if open - always decRef
+ // don't decrement the private searcher - could have been swapped
+ aSearcher.getIndexReader().decRef();
+ }
+
+ private void ensureOpen() {
+ if (closed) {
+ throw new AlreadyClosedException("Spellchecker has been closed");
+ }
+ }
+
+ /**
+ * Close the IndexSearcher used by this SpellChecker
+ * @throws IOException if the close operation causes an {@link IOException}
+ * @throws AlreadyClosedException if the {@link SpellChecker} is already closed
+ */
+ public void close() throws IOException {
+ synchronized (searcherLock) {
+ ensureOpen();
+ closed = true;
+ if (searcher != null) {
+ searcher.close();
+ }
+ searcher = null;
+ }
+ }
+
+ private void swapSearcher(final Directory dir) throws IOException {
+ /*
+ * opening a searcher is possibly very expensive.
+ * We rather close it again if the Spellchecker was closed during
+ * this operation than block access to the current searcher while opening.
+ */
+ final IndexSearcher indexSearcher = createSearcher(dir);
+ synchronized (searcherLock) {
+ if(closed){
+ indexSearcher.close();
+ throw new AlreadyClosedException("Spellchecker has been closed");
+ }
+ if (searcher != null) {
+ searcher.close();
+ }
+ // set the spellindex in the sync block - ensure consistency.
+ searcher = indexSearcher;
+ this.spellIndex = dir;
+ }
+ }
+
+ /**
+ * Creates a new read-only IndexSearcher
+ * @param dir the directory used to open the searcher
+ * @return a new read-only IndexSearcher
+ * @throws IOException if there is a low-level IO error
+ */
+ // for testing purposes
+ IndexSearcher createSearcher(final Directory dir) throws IOException{
+ return new IndexSearcher(dir, true);
+ }
+
+ /**
+ * Returns true if and only if the {@link SpellChecker} is
+ * closed, otherwise false.
+ *
+ * @return true if and only if the {@link SpellChecker} is
+ * closed, otherwise false.
+ */
+ boolean isClosed(){
+ return closed;
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/StringDistance.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/StringDistance.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/StringDistance.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/StringDistance.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,35 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Interface for string distances.
+ */
+public interface StringDistance {
+
+ /**
+ * Returns a float between 0 and 1 based on how similar the specified strings are to one another.
+ * Returning a value of 1 means the specified strings are identical and 0 means the
+ * strings are maximally different.
+ * @param s1 The first string.
+ * @param s2 The second string.
+ * @return a float between 0 and 1 based on how similar the specified strings are to one another.
+ */
+ public float getDistance(String s1,String s2);
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWord.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWord.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWord.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWord.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,45 @@
+package org.apache.lucene.search.spell;
+
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * SuggestWord, used in suggestSimilar method in SpellChecker class.
+ *
+ * Default sort is first by score, then by frequency.
+ *
+ *
+ */
+public final class SuggestWord{
+
+ /**
+ * the score of the word
+ */
+ public float score;
+
+ /**
+ * The freq of the word
+ */
+ public int freq;
+
+ /**
+ * the suggested word
+ */
+ public String string;
+
+}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,47 @@
+package org.apache.lucene.search.spell;
+
+import java.util.Comparator;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Frequency first, then score. Must have
+ *
+ **/
+public class SuggestWordFrequencyComparator implements Comparator {
+
+ public int compare(SuggestWord first, SuggestWord second) {
+ // first criteria: the frequency
+ if (first.freq > second.freq) {
+ return 1;
+ }
+ if (first.freq < second.freq) {
+ return -1;
+ }
+
+ // second criteria (if first criteria is equal): the score
+ if (first.score > second.score) {
+ return 1;
+ }
+ if (first.score < second.score) {
+ return -1;
+ }
+ // third criteria: term text
+ return second.string.compareTo(first.string);
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,63 @@
+package org.apache.lucene.search.spell;
+
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.PriorityQueue;
+
+import java.util.Comparator;
+
+
+/**
+ * Sorts SuggestWord instances
+ *
+ * @see org.apache.lucene.search.spell.SuggestWordScoreComparator
+ * @see org.apache.lucene.search.spell.SuggestWordFrequencyComparator
+ *
+ */
+public final class SuggestWordQueue extends PriorityQueue {
+ public static final Comparator DEFAULT_COMPARATOR = new SuggestWordScoreComparator();
+
+
+ private Comparator comparator;
+
+ /**
+ * Use the {@link #DEFAULT_COMPARATOR}
+ * @param size The size of the queue
+ */
+ public SuggestWordQueue (int size) {
+ super(size);
+ comparator = DEFAULT_COMPARATOR;
+ }
+
+ /**
+ * Specify the size of the queue and the comparator to use for sorting.
+ * @param size The size
+ * @param comparator The comparator.
+ */
+ public SuggestWordQueue(int size, Comparator comparator){
+ super(size);
+ this.comparator = comparator;
+ }
+
+ @Override
+ protected final boolean lessThan (SuggestWord wa, SuggestWord wb) {
+ int val = comparator.compare(wa, wb);
+ return val < 0;
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,47 @@
+package org.apache.lucene.search.spell;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Comparator;
+
+
+/**
+ * Score first, then frequency
+ *
+ **/
+public class SuggestWordScoreComparator implements Comparator {
+ public int compare(SuggestWord first, SuggestWord second) {
+ // first criteria: the distance
+ if (first.score > second.score) {
+ return 1;
+ }
+ if (first.score < second.score) {
+ return -1;
+ }
+
+ // second criteria (if first criteria is equal): the popularity
+ if (first.freq > second.freq) {
+ return 1;
+ }
+
+ if (first.freq < second.freq) {
+ return -1;
+ }
+ // third criteria: term text
+ return second.string.compareTo(first.string);
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java 2011-05-22 19:00:30.000000000 -0400
@@ -0,0 +1,33 @@
+package org.apache.lucene.search.spell;
+
+import java.util.Iterator;
+
+public interface TermFreqIterator extends Iterator {
+
+ public float freq();
+
+ public static class TermFreqIteratorWrapper implements TermFreqIterator {
+ private Iterator wrapped;
+
+ public TermFreqIteratorWrapper(Iterator wrapped) {
+ this.wrapped = wrapped;
+ }
+
+ public float freq() {
+ return 1.0f;
+ }
+
+ public boolean hasNext() {
+ return wrapped.hasNext();
+ }
+
+ public String next() {
+ return wrapped.next().toString();
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java 2011-05-22 17:03:03.000000000 -0400
@@ -0,0 +1,65 @@
+package org.apache.lucene.search.suggest;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.spell.TermFreqIterator;
+
+/**
+ * This wrapper buffers incoming elements.
+ */
+public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
+
+ /** Entry in the buffer. */
+ public static final class Entry implements Comparable {
+ String word;
+ float freq;
+
+ public Entry(String word, float freq) {
+ this.word = word;
+ this.freq = freq;
+ }
+
+ public int compareTo(Entry o) {
+ return word.compareTo(o.word);
+ }
+ }
+
+ protected ArrayList entries = new ArrayList();
+
+ protected int curPos;
+ protected Entry curEntry;
+
+ public BufferingTermFreqIteratorWrapper(TermFreqIterator source) {
+ // read all source data into buffer
+ while (source.hasNext()) {
+ String w = source.next();
+ Entry e = new Entry(w, source.freq());
+ entries.add(e);
+ }
+ curPos = 0;
+ }
+
+ public float freq() {
+ return curEntry.freq;
+ }
+
+ public boolean hasNext() {
+ return curPos < entries.size();
+ }
+
+ public String next() {
+ curEntry = entries.get(curPos);
+ curPos++;
+ return curEntry.word;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException("remove is not supported");
+ }
+
+ public List entries() {
+ return entries;
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java 2011-05-22 17:03:10.000000000 -0400
@@ -0,0 +1,95 @@
+package org.apache.lucene.search.suggest;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.*;
+
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.TermFreqIterator;
+
+
+/**
+ * Dictionary represented by a text file.
+ *
+ * Format allowed: 1 string per line, optionally with a tab-separated integer value:
+ * word1 TAB 100
+ * word2 word3 TAB 101
+ * word4 word5 TAB 102
+ */
+public class FileDictionary implements Dictionary {
+
+ private BufferedReader in;
+ private String line;
+ private boolean hasNextCalled;
+
+ public FileDictionary(InputStream dictFile) {
+ in = new BufferedReader(new InputStreamReader(dictFile));
+ }
+
+ /**
+ * Creates a dictionary based on a reader.
+ */
+ public FileDictionary(Reader reader) {
+ in = new BufferedReader(reader);
+ }
+
+ public TermFreqIterator getWordsIterator() {
+ return new fileIterator();
+ }
+
+ final class fileIterator implements TermFreqIterator {
+ private float curFreq;
+
+ public String next() {
+ if (!hasNextCalled) {
+ hasNext();
+ }
+ hasNextCalled = false;
+ return line;
+ }
+
+ public float freq() {
+ return curFreq;
+ }
+
+ public boolean hasNext() {
+ hasNextCalled = true;
+ try {
+ line = in.readLine();
+ if (line != null) {
+ String[] fields = line.split("\t");
+ if (fields.length > 1) {
+ curFreq = Float.parseFloat(fields[1]);
+ line = fields[0];
+ } else {
+ curFreq = 1;
+ }
+ }
+ } catch (IOException ex) {
+ throw new RuntimeException(ex);
+ }
+ return (line != null) ? true : false;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java 2011-05-22 18:18:14.000000000 -0400
@@ -0,0 +1,117 @@
+package org.apache.lucene.search.suggest;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.util.PriorityQueue;
+
+public abstract class Lookup {
+ /**
+ * Result of a lookup.
+ */
+ public static final class LookupResult implements Comparable {
+ public final String key;
+ public final float value;
+
+ public LookupResult(String key, float value) {
+ this.key = key;
+ this.value = value;
+ }
+
+ @Override
+ public String toString() {
+ return key + "/" + value;
+ }
+
+ /** Compare alphabetically. */
+ public int compareTo(LookupResult o) {
+ return this.key.compareTo(o.key);
+ }
+ }
+
+ public static final class LookupPriorityQueue extends PriorityQueue {
+
+ public LookupPriorityQueue(int size) {
+ super(size);
+ }
+
+ @Override
+ protected boolean lessThan(LookupResult a, LookupResult b) {
+ return a.value < b.value;
+ }
+
+ public LookupResult[] getResults() {
+ int size = size();
+ LookupResult[] res = new LookupResult[size];
+ for (int i = size - 1; i >= 0; i--) {
+ res[i] = pop();
+ }
+ return res;
+ }
+ }
+
+ /** Build lookup from a dictionary. Some implementations may require sorted
+ * or unsorted keys from the dictionary's iterator - use
+ * {@link SortedTermFreqIteratorWrapper} or
+ * {@link UnsortedTermFreqIteratorWrapper} in such case.
+ */
+ public void build(Dictionary dict) throws IOException {
+ Iterator it = dict.getWordsIterator();
+ TermFreqIterator tfit;
+ if (it instanceof TermFreqIterator) {
+ tfit = (TermFreqIterator)it;
+ } else {
+ tfit = new TermFreqIterator.TermFreqIteratorWrapper(it);
+ }
+ build(tfit);
+ }
+
+ public abstract void build(TermFreqIterator tfit) throws IOException;
+
+ /**
+ * Persist the constructed lookup data to a directory. Optional operation.
+ * @param storeDir directory where data can be stored.
+ * @return true if successful, false if unsuccessful or not supported.
+ * @throws IOException when fatal IO error occurs.
+ */
+ public abstract boolean store(File storeDir) throws IOException;
+
+ /**
+ * Discard current lookup data and load it from a previously saved copy.
+ * Optional operation.
+ * @param storeDir directory where lookup data was stored.
+ * @return true if completed successfully, false if unsuccessful or not supported.
+ * @throws IOException when fatal IO error occurs.
+ */
+ public abstract boolean load(File storeDir) throws IOException;
+
+ /**
+ * Look up a key and return possible completion for this key.
+ * @param key lookup key. Depending on the implementation this may be
+ * a prefix, misspelling, or even infix.
+ * @param onlyMorePopular return only more popular results
+ * @param num maximum number of results to return
+ * @return a list of possible completions, with their relative weight (e.g. popularity)
+ */
+ public abstract List lookup(String key, boolean onlyMorePopular, int num);
+
+ /**
+ * Modify the lookup data by recording additional data. Optional operation.
+ * @param key new lookup key
+ * @param value value to associate with this key
+ * @return true if new key is added, false if it already exists or operation
+ * is not supported.
+ */
+ public abstract boolean add(String key, Object value);
+
+ /**
+ * Get value associated with a specific key.
+ * @param key lookup key
+ * @return associated value
+ */
+ public abstract Object get(String key);
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java 2011-05-22 17:03:26.000000000 -0400
@@ -0,0 +1,18 @@
+package org.apache.lucene.search.suggest;
+
+import java.util.Collections;
+
+import org.apache.lucene.search.spell.SortedIterator;
+import org.apache.lucene.search.spell.TermFreqIterator;
+
/**
 * This wrapper buffers incoming elements and makes sure they are sorted in
 * ascending lexicographic order.
 */
public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator {

  public SortedTermFreqIteratorWrapper(TermFreqIterator source) {
    super(source);
    // The superclass constructor drains `source` into `entries`
    // (presumably a List of comparable entries -- see
    // BufferingTermFreqIteratorWrapper); sorting once up-front makes the
    // iteration order ascending by natural order.
    Collections.sort(entries);
  }
}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java 2011-05-22 17:03:33.000000000 -0400
@@ -0,0 +1,17 @@
+package org.apache.lucene.search.suggest;
+
+import java.util.Collections;
+
+import org.apache.lucene.search.spell.TermFreqIterator;
+
/**
 * This wrapper buffers the incoming elements and makes sure they are in
 * random order.
 */
public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {

  public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) {
    super(source);
    // Randomize the buffered entries filled in by the superclass constructor.
    // NOTE(review): unseeded shuffle makes iteration order nondeterministic
    // across runs -- confirm reproducibility is not required (e.g. for tests).
    Collections.shuffle(entries);
  }
}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java 2011-05-22 18:59:15.000000000 -0400
@@ -0,0 +1,540 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.fst.Builder;
+import org.apache.lucene.util.automaton.fst.FST;
+import org.apache.lucene.util.automaton.fst.FST.Arc;
+import org.apache.lucene.util.automaton.fst.NoOutputs;
+import org.apache.lucene.util.automaton.fst.Outputs;
+
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.tst.TSTLookup;
+import org.apache.lucene.search.spell.TermFreqIterator;
+
+/**
+ * Finite state automata based implementation of {@link Lookup} query
+ * suggestion/ autocomplete interface.
+ *
+ *
Implementation details
+ *
+ *
The construction step in {@link #build(TermFreqIterator)} works as follows:
+ *
+ *
A set of input terms (String) and weights (float) is given.
+ *
The range of weights is determined and then all weights are discretized into a fixed set
+ * of values ({@link #buckets}).
+ * Note that this means that minor changes in weights may be lost during automaton construction.
+ * In general, this is not a big problem because the "priorities" of completions can be split
+ * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
+ * If you need exact, fine-grained weights, use {@link TSTLookup} instead.
+ *
All terms in the input are prepended with a synthetic pseudo-character being the weight
+ * of that term. For example a term abc with a discretized weight equal '1' would
+ * become 1abc.
+ *
The terms are sorted by their raw value of utf16 character values (including the synthetic
+ * term in front).
+ *
A finite state automaton ({@link FST}) is constructed from the input. The root node has
+ * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.
+ *
+ *
+ *
At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
+ *
+ *
For each possible term weight encoded in the automaton (cached arcs from the root above),
+ * starting with the highest one, we descend along the path of the input key. If the key is not
+ * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
+ * No completions.
+ *
Otherwise, we have found an internal automaton node that ends the key. The entire
+ * subautomaton (all paths) starting from this node form the key's completions. We start
+ * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
+ * suggestion to the list of results (the weight of this suggestion is constant and equal to the
+ * root path we started from). The tricky part is that because automaton edges are sorted and
+ * we scan depth-first, we can terminate the entire procedure as soon as we collect enough
+ * suggestions the user requested.
+ *
In case the number of suggestions collected in the step above is still insufficient,
+ * we proceed to the next (smaller) weight leaving the root node and repeat the same
+ * algorithm again.
+ *
+ *
+ *
+ *
Runtime behavior and performance characteristic
+ *
+ *
The algorithm described above is optimized for finding suggestions to short prefixes
+ * in a top-weights-first order. This is probably the most common use case: it allows
+ * presenting suggestions early and sorts them by the global frequency (and then alphabetically).
+ *
+ *
If there is an exact match in the automaton, it is returned first on the results
+ * list (even with by-weight sorting).
+ *
+ *
Note that the maximum lookup time for any prefix
+ * is the time of descending to the subtree, plus traversal of the subtree up to the number
+ * of requested suggestions (because they are already presorted by weight on the root level
+ * and alphabetically at any node level).
+ *
+ *
To order alphabetically only (no ordering by priorities), use identical term weights
+ * for all terms. Alphabetical suggestions are returned even if non-constant weights are
+ * used, but the algorithm for doing this is suboptimal.
+ *
+ *
"alphabetically" in any of the documentation above indicates utf16 codepoint order,
+ * nothing else.
+ */
+public class FSTLookup extends Lookup {
+
  /** Creates a lookup with 10 weight buckets and exact-match-first enabled. */
  public FSTLookup() {
    this(10, true);
  }
+
  /**
   * Creates a lookup.
   *
   * @param buckets number of weight-discretization buckets; per the
   *        {@code buckets} field docs this must be within [1, 255]
   *        (NOTE(review): not validated here -- confirm callers guarantee it).
   * @param exactMatchFirst if true, an exact key match is returned first,
   *        even if prefixes with larger weights exist.
   */
  public FSTLookup(int buckets, boolean exactMatchFirst) {
    this.buckets = buckets;
    this.exactMatchFirst = exactMatchFirst;
  }
+
  /** A structure for a single entry (for sorting/ preprocessing). */
  private static class Entry {
    // Characters of the term (may later include the synthetic weight
    // prefix described in the class javadoc -- TODO confirm at which stage).
    char [] term;
    // Weight (frequency) associated with the term.
    float weight;

    public Entry(char [] term, float freq) {
      this.term = term;
      this.weight = freq;
    }
  }
+
+ /** Serialized automaton file name (storage). */
+ public static final String FILENAME = "fst.dat";
+
+ /** An empty result. */
+ private static final List EMPTY_RESULT = Collections.emptyList();
+
+ /**
+ * The number of separate buckets for weights (discretization). The more buckets,
+ * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
+ * will not decrease for prefixes which have highly-weighted completions (because these
+ * are filled-in first), but will decrease significantly for low-weighted terms (but
+ * these should be infrequent, so it is all right).
+ *
+ *
The number of buckets must be within [1, 255] range.
+ */
+ private final int buckets;
+
+ /**
+ * If true, exact suggestions are returned first, even if they are prefixes
+ * of other strings in the automaton (possibly with larger weights).
+ */
+ private final boolean exactMatchFirst;
+
+ /**
+ * Finite state automaton encoding all the lookup terms. See class
+ * notes for details.
+ */
+ private FST
+ *
+ *
+ * This data structure is faster than hashing for many typical search problems,
+ * and supports a broader range of useful problems and operations. Ternary
+ * searches are faster than hashing and more powerful, too.
+ *
+ *
+ *
+ * The theory of ternary search trees was described at a symposium in 1997 (see
+ * "Fast Algorithms for Sorting and Searching Strings," by J.L. Bentley and R.
+ * Sedgewick, Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete
+ * Algorithms, January 1997). Algorithms in C, Third Edition, by Robert
+ * Sedgewick (Addison-Wesley, 1998) provides yet another view of ternary search
+ * trees.
+ *
+ * @author Bruno Martins
+ *
+ */
+public class JaspellTernarySearchTrie {
+
  /**
   * An inner class of Ternary Search Trie that represents a node in the trie.
   */
  protected final class TSTNode {

    /** Index values for accessing relatives array. */
    protected final static int PARENT = 0, LOKID = 1, EQKID = 2, HIKID = 3;

    /** The data attached to this node (null for purely structural nodes). */
    protected Object data;

    /** The relative nodes: parent, low child, equal child and high child. */
    protected TSTNode[] relatives = new TSTNode[4];

    /** The char used in the split. */
    protected char splitchar;

    /**
     * Constructor method.
     *
     *@param splitchar
     *          The char used in the split.
     *@param parent
     *          The parent node.
     */
    protected TSTNode(char splitchar, TSTNode parent) {
      this.splitchar = splitchar;
      relatives[PARENT] = parent;
    }
  }
+
  /**
   * Compares characters by alphabetical order, ignoring case.
   *
   *@param cCompare2
   *          The first char in the comparison.
   *@param cRef
   *          The second char in the comparison.
   *@return A negative number, 0 or a positive number if the first char is
   *         less than, equal to or greater than the second, after
   *         lower-casing both.
   */
  private static int compareCharsAlphabetically(char cCompare2, char cRef) {
    return Character.toLowerCase(cCompare2) - Character.toLowerCase(cRef);
  }
+
+ /* what follows is the original Jaspell code.
+ private static int compareCharsAlphabetically(int cCompare2, int cRef) {
+ int cCompare = 0;
+ if (cCompare2 >= 65) {
+ if (cCompare2 < 89) {
+ cCompare = (2 * cCompare2) - 65;
+ } else if (cCompare2 < 97) {
+ cCompare = cCompare2 + 24;
+ } else if (cCompare2 < 121) {
+ cCompare = (2 * cCompare2) - 128;
+ } else cCompare = cCompare2;
+ } else cCompare = cCompare2;
+ if (cRef < 65) {
+ return cCompare - cRef;
+ }
+ if (cRef < 89) {
+ return cCompare - ((2 * cRef) - 65);
+ }
+ if (cRef < 97) {
+ return cCompare - (cRef + 24);
+ }
+ if (cRef < 121) {
+ return cCompare - ((2 * cRef) - 128);
+ }
+ return cCompare - cRef;
+ }
+ */
+
  /**
   * The default number of values returned by the <code>matchAlmost</code>
   * and <code>matchPrefix</code> methods; a negative value means "no limit".
   */
  private int defaultNumReturnValues = -1;

  /**
   * the number of differences allowed in a call to the
   * <code>matchAlmostKey</code> method.
   */
  private int matchAlmostDiff;

  /** The base node in the trie (null while the trie is empty). */
  private TSTNode rootNode;
+
  /**
   * Constructs an empty Ternary Search Trie. Nodes are created lazily as
   * keys are inserted (see {@link #put(String, Object)}).
   */
  public JaspellTernarySearchTrie() {
  }
+
  /** Replaces the root node; used when loading a previously saved trie. */
  void setRoot(TSTNode newRoot) {
    rootNode = newRoot;
  }
+
  /** Returns the root node; used when saving the trie. */
  TSTNode getRoot() {
    return rootNode;
  }
+
  /**
   * Constructs a Ternary Search Trie and loads data from a <code>File</code>
   * into the Trie. The file is a normal text document, where each line is of
   * the form word TAB float. Delegates to the two-argument constructor with
   * compression disabled.
   *
   *@param file
   *          The <code>File</code> with the data to load into the Trie.
   *@exception IOException
   *              A problem occurred while reading the data.
   */
  public JaspellTernarySearchTrie(File file) throws IOException {
    this(file, false);
  }
+
+ /**
+ * Constructs a Ternary Search Trie and loads data from a File
+ * into the Trie. The file is a normal text document, where each line is of
+ * the form "word TAB float".
+ *
+ *@param file
+ * The File with the data to load into the Trie.
+ *@param compression
+ * If true, the file is compressed with the GZIP algorithm, and if
+ * false, the file is a normal text document.
+ *@exception IOException
+ * A problem occured while reading the data.
+ */
+ public JaspellTernarySearchTrie(File file, boolean compression)
+ throws IOException {
+ this();
+ BufferedReader in;
+ if (compression)
+ in = new BufferedReader(new InputStreamReader(new GZIPInputStream(
+ new FileInputStream(file))));
+ else in = new BufferedReader(new InputStreamReader((new FileInputStream(
+ file))));
+ String word;
+ int pos;
+ Float occur, one = new Float(1);
+ int numWords = 0;
+ while ((word = in.readLine()) != null) {
+ numWords++;
+ pos = word.indexOf("\t");
+ occur = one;
+ if (pos != -1) {
+ occur = Float.parseFloat(word.substring(pos + 1).trim());
+ word = word.substring(0, pos);
+ }
+ String key = word.toLowerCase();
+ if (rootNode == null) {
+ rootNode = new TSTNode(key.charAt(0), null);
+ }
+ TSTNode node = null;
+ if (key.length() > 0 && rootNode != null) {
+ TSTNode currentNode = rootNode;
+ int charIndex = 0;
+ while (true) {
+ if (currentNode == null) break;
+ int charComp = compareCharsAlphabetically(key.charAt(charIndex),
+ currentNode.splitchar);
+ if (charComp == 0) {
+ charIndex++;
+ if (charIndex == key.length()) {
+ node = currentNode;
+ break;
+ }
+ currentNode = currentNode.relatives[TSTNode.EQKID];
+ } else if (charComp < 0) {
+ currentNode = currentNode.relatives[TSTNode.LOKID];
+ } else {
+ currentNode = currentNode.relatives[TSTNode.HIKID];
+ }
+ }
+ Float occur2 = null;
+ if (node != null) occur2 = ((Float) (node.data));
+ if (occur2 != null) {
+ occur += occur2.floatValue();
+ }
+ currentNode = getOrCreateNode(word.trim().toLowerCase());
+ currentNode.data = occur;
+ }
+ }
+ in.close();
+ }
+
+ /**
+ * Deletes the node passed in as an argument. If this node has non-null data,
+ * then both the node and the data will be deleted. It also deletes any other
+ * nodes in the trie that are no longer needed after the deletion of the node.
+ *
+ *@param nodeToDelete
+ * The node to delete.
+ */
+ private void deleteNode(TSTNode nodeToDelete) {
+ if (nodeToDelete == null) {
+ return;
+ }
+ nodeToDelete.data = null;
+ while (nodeToDelete != null) {
+ nodeToDelete = deleteNodeRecursion(nodeToDelete);
+ // deleteNodeRecursion(nodeToDelete);
+ }
+ }
+
  /**
   * Recursively visits each node to be deleted.
   *
   * To delete a node, first set its data to null, then pass it into this
   * method, then pass the node returned by this method into this method (make
   * sure you don't delete the data of any of the nodes returned from this
   * method!) and continue in this fashion until the node returned by this
   * method is null.
   *
   * The TSTNode instance returned by this method will be next node to be
   * operated on by <code>deleteNodeRecursion</code> (This emulates recursive
   * method call while avoiding the JVM overhead normally associated with a
   * recursive method.)
   *
   *@param currentNode
   *          The node to delete.
   *@return The next node to be called in deleteNodeRecursion.
   */
  private TSTNode deleteNodeRecursion(TSTNode currentNode) {
    if (currentNode == null) {
      return null;
    }
    // Can't delete this node if it has a non-null eq kid or data.
    if (currentNode.relatives[TSTNode.EQKID] != null
        || currentNode.data != null) {
      return null;
    }
    // NOTE(review): if currentNode is the root, relatives[PARENT] is null and
    // the dereference below throws NPE -- confirm this method is never
    // reached with the root in that state.
    TSTNode currentParent = currentNode.relatives[TSTNode.PARENT];
    boolean lokidNull = currentNode.relatives[TSTNode.LOKID] == null;
    boolean hikidNull = currentNode.relatives[TSTNode.HIKID] == null;
    // Determine which of the parent's slots points at this node.
    int childType;
    if (currentParent.relatives[TSTNode.LOKID] == currentNode) {
      childType = TSTNode.LOKID;
    } else if (currentParent.relatives[TSTNode.EQKID] == currentNode) {
      childType = TSTNode.EQKID;
    } else if (currentParent.relatives[TSTNode.HIKID] == currentNode) {
      childType = TSTNode.HIKID;
    } else {
      // Parent does not reference this node: treat as detached root.
      rootNode = null;
      return null;
    }
    if (lokidNull && hikidNull) {
      // Leaf node: simply unlink it from the parent.
      currentParent.relatives[childType] = null;
      return currentParent;
    }
    if (lokidNull) {
      // Only a high child exists: splice it into this node's slot.
      currentParent.relatives[childType] = currentNode.relatives[TSTNode.HIKID];
      currentNode.relatives[TSTNode.HIKID].relatives[TSTNode.PARENT] = currentParent;
      return currentParent;
    }
    if (hikidNull) {
      // Only a low child exists: splice it into this node's slot.
      currentParent.relatives[childType] = currentNode.relatives[TSTNode.LOKID];
      currentNode.relatives[TSTNode.LOKID].relatives[TSTNode.PARENT] = currentParent;
      return currentParent;
    }
    // Both children exist: pick which subtree to promote based on splitchar
    // distance (ties broken randomly), then re-attach the other subtree at
    // the deepest same-direction slot of the promoted one.
    int deltaHi = currentNode.relatives[TSTNode.HIKID].splitchar
        - currentNode.splitchar;
    int deltaLo = currentNode.splitchar
        - currentNode.relatives[TSTNode.LOKID].splitchar;
    int movingKid;
    TSTNode targetNode;
    if (deltaHi == deltaLo) {
      if (Math.random() < 0.5) {
        deltaHi++;
      } else {
        deltaLo++;
      }
    }
    if (deltaHi > deltaLo) {
      movingKid = TSTNode.HIKID;
      targetNode = currentNode.relatives[TSTNode.LOKID];
    } else {
      movingKid = TSTNode.LOKID;
      targetNode = currentNode.relatives[TSTNode.HIKID];
    }
    while (targetNode.relatives[movingKid] != null) {
      targetNode = targetNode.relatives[movingKid];
    }
    targetNode.relatives[movingKid] = currentNode.relatives[movingKid];
    currentParent.relatives[childType] = targetNode;
    targetNode.relatives[TSTNode.PARENT] = currentParent;
    if (!lokidNull) {
      currentNode.relatives[TSTNode.LOKID] = null;
    }
    if (!hikidNull) {
      currentNode.relatives[TSTNode.HIKID] = null;
    }
    return currentParent;
  }
+
+ /**
+ * Retrieve the object indexed by a key.
+ *
+ *@param key
+ * A String index.
+ *@return The object retrieved from the Ternary Search Trie.
+ */
+ public Object get(String key) {
+ TSTNode node = getNode(key.trim().toLowerCase());
+ if (node == null) {
+ return null;
+ }
+ return node.data;
+ }
+
  /**
   * Retrieve the <code>Float</code> indexed by key, increment it by one unit
   * and store the new <code>Float</code>.
   *
   *@param key
   *          A <code>String</code> index.
   *@return The incremented <code>Float</code> now stored in the trie, or
   *         null if the key is not present.
   */
  public Float getAndIncrement(String key) {
    String key2 = key.trim().toLowerCase();
    TSTNode node = getNode(key2);
    if (node == null) {
      return null;
    }
    Float aux = (Float) (node.data);
    if (aux == null) {
      aux = new Float(1);
    } else {
      // NOTE(review): intValue() truncates any fractional part before
      // incrementing -- confirm this is intended for non-integer weights.
      aux = new Float(aux.intValue() + 1);
    }
    put(key2, aux);
    return aux;
  }
+
+ /**
+ * Returns the key that indexes the node argument.
+ *
+ *@param node
+ * The node whose index is to be calculated.
+ *@return The String that indexes the node argument.
+ */
+ protected String getKey(TSTNode node) {
+ StringBuffer getKeyBuffer = new StringBuffer();
+ getKeyBuffer.setLength(0);
+ getKeyBuffer.append("" + node.splitchar);
+ TSTNode currentNode;
+ TSTNode lastNode;
+ currentNode = node.relatives[TSTNode.PARENT];
+ lastNode = node;
+ while (currentNode != null) {
+ if (currentNode.relatives[TSTNode.EQKID] == lastNode) {
+ getKeyBuffer.append("" + currentNode.splitchar);
+ }
+ lastNode = currentNode;
+ currentNode = currentNode.relatives[TSTNode.PARENT];
+ }
+ getKeyBuffer.reverse();
+ return getKeyBuffer.toString();
+ }
+
  /**
   * Returns the node indexed by key, or <code>null</code> if that node doesn't
   * exist. Search begins at root node. Note that the two-argument overload
   * trims and lower-cases the key before searching.
   *
   *@param key
   *          A <code>String</code> that indexes the node that is returned.
   *@return The node object indexed by key. This object is an instance of an
   *         inner class named <code>TernarySearchTrie.TSTNode</code>.
   */
  public TSTNode getNode(String key) {
    return getNode(key, rootNode);
  }
+
+ /**
+ * Returns the node indexed by key, or null if that node doesn't
+ * exist. The search begins at root node.
+ *
+ *@param key2
+ * A String that indexes the node that is returned.
+ *@param startNode
+ * The top node defining the subtrie to be searched.
+ *@return The node object indexed by key. This object is an instance of an
+ * inner class named TernarySearchTrie.TSTNode.
+ */
+ protected TSTNode getNode(String key2, TSTNode startNode) {
+ String key = key2.trim().toLowerCase();
+ if (key == null || startNode == null || key.length() == 0) {
+ return null;
+ }
+ TSTNode currentNode = startNode;
+ int charIndex = 0;
+ while (true) {
+ if (currentNode == null) {
+ return null;
+ }
+ int charComp = compareCharsAlphabetically(key.charAt(charIndex),
+ currentNode.splitchar);
+ if (charComp == 0) {
+ charIndex++;
+ if (charIndex == key.length()) {
+ return currentNode;
+ }
+ currentNode = currentNode.relatives[TSTNode.EQKID];
+ } else if (charComp < 0) {
+ currentNode = currentNode.relatives[TSTNode.LOKID];
+ } else {
+ currentNode = currentNode.relatives[TSTNode.HIKID];
+ }
+ }
+ }
+
  /**
   * Returns the node indexed by key, creating that node if it doesn't exist,
   * and creating any required intermediate nodes if they don't exist.
   *
   *@param key
   *          A <code>String</code> that indexes the node that is returned.
   *@return The node object indexed by key. This object is an instance of an
   *         inner class named <code>TernarySearchTrie.TSTNode</code>.
   *@exception NullPointerException
   *              If the key is <code>null</code>.
   *@exception IllegalArgumentException
   *              If the key is an empty <code>String</code>.
   */
  protected TSTNode getOrCreateNode(String key) throws NullPointerException,
      IllegalArgumentException {
    if (key == null) {
      throw new NullPointerException(
          "attempt to get or create node with null key");
    }
    if (key.length() == 0) {
      throw new IllegalArgumentException(
          "attempt to get or create node with key of zero length");
    }
    if (rootNode == null) {
      rootNode = new TSTNode(key.charAt(0), null);
    }
    TSTNode currentNode = rootNode;
    int charIndex = 0;
    while (true) {
      int charComp = compareCharsAlphabetically(key.charAt(charIndex),
          currentNode.splitchar);
      if (charComp == 0) {
        // Character matched: consume it and descend the EQKID arc,
        // creating the missing interior node on demand.
        charIndex++;
        if (charIndex == key.length()) {
          return currentNode;
        }
        if (currentNode.relatives[TSTNode.EQKID] == null) {
          currentNode.relatives[TSTNode.EQKID] = new TSTNode(key
              .charAt(charIndex), currentNode);
        }
        currentNode = currentNode.relatives[TSTNode.EQKID];
      } else if (charComp < 0) {
        // Descend (or create) the low branch; the character is not consumed.
        if (currentNode.relatives[TSTNode.LOKID] == null) {
          currentNode.relatives[TSTNode.LOKID] = new TSTNode(key
              .charAt(charIndex), currentNode);
        }
        currentNode = currentNode.relatives[TSTNode.LOKID];
      } else {
        // Descend (or create) the high branch; the character is not consumed.
        if (currentNode.relatives[TSTNode.HIKID] == null) {
          currentNode.relatives[TSTNode.HIKID] = new TSTNode(key
              .charAt(charIndex), currentNode);
        }
        currentNode = currentNode.relatives[TSTNode.HIKID];
      }
    }
  }
+
  /**
   * Returns a <code>List</code> of keys that almost match the argument key.
   * Keys returned will have exactly diff characters that do not match the
   * target key, where diff is equal to the last value passed in as an argument
   * to the <code>setMatchAlmostDiff</code> method.
   *
   * If the <code>matchAlmost</code> method is called before the
   * <code>setMatchAlmostDiff</code> method has been called for the first time,
   * then diff = 0.
   *
   * Delegates with the default result-count limit
   * (<code>defaultNumReturnValues</code>).
   *
   *@param key
   *          The target key.
   *@return A <code>List</code> with the results.
   */
  public List matchAlmost(String key) {
    return matchAlmost(key, defaultNumReturnValues);
  }
+
  /**
   * Returns a <code>List</code> of keys that almost match the argument key.
   * Keys returned will have exactly diff characters that do not match the
   * target key, where diff is equal to the last value passed in as an argument
   * to the <code>setMatchAlmostDiff</code> method.
   *
   * If the <code>matchAlmost</code> method is called before the
   * <code>setMatchAlmostDiff</code> method has been called for the first time,
   * then diff = 0.
   *
   *@param key
   *          The target key.
   *@param numReturnValues
   *          The maximum number of values returned by this method (a
   *          negative value means no limit).
   *@return A <code>List</code> with the results
   */
  public List matchAlmost(String key, int numReturnValues) {
    // upTo == false: only keys with EXACTLY matchAlmostDiff mismatched
    // letters are collected (see matchAlmostRecursion).
    return matchAlmostRecursion(rootNode, 0, matchAlmostDiff, key,
        ((numReturnValues < 0) ? -1 : numReturnValues), new Vector(), false);
  }
+
  /**
   * Recursively visits the nodes in order to find the ones that almost match a
   * given key.
   *
   *@param currentNode
   *          The current node.
   *@param charIndex
   *          The current char.
   *@param d
   *          The number of differences still allowed.
   *@param matchAlmostNumReturnValues
   *          The maximum number of values in the result <code>List</code>
   *          (-1 means no limit).
   *@param matchAlmostResult2
   *          The results so far.
   *@param upTo
   *          If true all keys having up to and including matchAlmostDiff
   *          mismatched letters will be included in the result (including a key
   *          that is exactly the same as the target string) otherwise keys will
   *          be included in the result only if they have exactly
   *          matchAlmostDiff number of mismatched letters.
   *@param matchAlmostKey
   *          The key being searched.
   *@return A <code>List</code> with the results.
   */
  private List matchAlmostRecursion(TSTNode currentNode, int charIndex,
      int d, String matchAlmostKey, int matchAlmostNumReturnValues,
      List matchAlmostResult2, boolean upTo) {
    // Stop when: no node, result list already full, mismatch budget
    // exhausted, or the key has been fully consumed.
    if ((currentNode == null)
        || (matchAlmostNumReturnValues != -1 && matchAlmostResult2.size() >= matchAlmostNumReturnValues)
        || (d < 0) || (charIndex >= matchAlmostKey.length())) {
      return matchAlmostResult2;
    }
    int charComp = compareCharsAlphabetically(matchAlmostKey.charAt(charIndex),
        currentNode.splitchar);
    List matchAlmostResult = matchAlmostResult2;
    // Explore the low branch when mismatches remain or the key char is lower.
    if ((d > 0) || (charComp < 0)) {
      matchAlmostResult = matchAlmostRecursion(
          currentNode.relatives[TSTNode.LOKID], charIndex, d,
          matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult,
          upTo);
    }
    // Taking the EQKID arc with a non-matching char consumes one mismatch.
    int nextD = (charComp == 0) ? d : d - 1;
    boolean cond = (upTo) ? (nextD >= 0) : (nextD == 0);
    if ((matchAlmostKey.length() == charIndex + 1) && cond
        && (currentNode.data != null)) {
      matchAlmostResult.add(getKey(currentNode));
    }
    matchAlmostResult = matchAlmostRecursion(
        currentNode.relatives[TSTNode.EQKID], charIndex + 1, nextD,
        matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo);
    // Explore the high branch when mismatches remain or the key char is higher.
    if ((d > 0) || (charComp > 0)) {
      matchAlmostResult = matchAlmostRecursion(
          currentNode.relatives[TSTNode.HIKID], charIndex, d,
          matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult,
          upTo);
    }
    return matchAlmostResult;
  }
+
  /**
   * Returns an alphabetical <code>List</code> of all keys in the trie that
   * begin with a given prefix. Only keys for nodes having non-null data are
   * included in the <code>List</code>. Delegates with the default
   * result-count limit (<code>defaultNumReturnValues</code>).
   *
   *@param prefix
   *          Each key returned from this method will begin with the characters
   *          in prefix.
   *@return A <code>List</code> with the results.
   */
  public List matchPrefix(String prefix) {
    return matchPrefix(prefix, defaultNumReturnValues);
  }
+
  /**
   * Returns an alphabetical <code>List</code> of all keys in the trie that
   * begin with a given prefix. Only keys for nodes having non-null data are
   * included in the <code>List</code>.
   *
   *@param prefix
   *          Each key returned from this method will begin with the characters
   *          in prefix.
   *@param numReturnValues
   *          The maximum number of values returned from this method (a
   *          negative value means no limit).
   *@return A <code>List</code> with the results
   */
  public List matchPrefix(String prefix, int numReturnValues) {
    Vector sortKeysResult = new Vector();
    // Locate the node corresponding to the prefix itself; no node means
    // no key in the trie starts with this prefix.
    TSTNode startNode = getNode(prefix);
    if (startNode == null) {
      return sortKeysResult;
    }
    // The prefix itself is a key when its node carries data.
    if (startNode.data != null) {
      sortKeysResult.addElement(getKey(startNode));
    }
    // All longer matches live in the subtrie under the EQKID arc.
    return sortKeysRecursion(startNode.relatives[TSTNode.EQKID],
        ((numReturnValues < 0) ? -1 : numReturnValues), sortKeysResult);
  }
+
  /**
   * Returns the number of nodes in the trie that have non-null data.
   * Delegates to {@link #numDataNodes(TSTNode)} starting at the root.
   *
   *@return The number of nodes in the trie that have non-null data.
   */
  public int numDataNodes() {
    return numDataNodes(rootNode);
  }
+
  /**
   * Returns the number of nodes in the subtrie below and including the starting
   * node. The method counts only nodes that have non-null data
   * (checkData == true in the shared recursion).
   *
   *@param startingNode
   *          The top node of the subtrie. the node that defines the subtrie.
   *@return The total number of nodes in the subtrie.
   */
  protected int numDataNodes(TSTNode startingNode) {
    return recursiveNodeCalculator(startingNode, true, 0);
  }
+
  /**
   * Returns the total number of nodes in the trie. The method counts nodes
   * whether or not they have data. Delegates to {@link #numNodes(TSTNode)}
   * starting at the root.
   *
   *@return The total number of nodes in the trie.
   */
  public int numNodes() {
    return numNodes(rootNode);
  }
+
  /**
   * Returns the total number of nodes in the subtrie below and including the
   * starting Node. The method counts nodes whether or not they have data
   * (checkData == false in the shared recursion).
   *
   *@param startingNode
   *          The top node of the subtrie. The node that defines the subtrie.
   *@return The total number of nodes in the subtrie.
   */
  protected int numNodes(TSTNode startingNode) {
    return recursiveNodeCalculator(startingNode, false, 0);
  }
+
+ /**
+ * Stores a value in the trie. The value may be retrieved using the key.
+ *
+ *@param key
+ * A String that indexes the object to be stored.
+ *@param value
+ * The object to be stored in the Trie.
+ */
+ public void put(String key, Object value) {
+ getOrCreateNode(key.trim().toLowerCase()).data = value;
+ }
+
+ /**
+ * Recursively visits each node to calculate the number of nodes.
+ *
+ *@param currentNode
+ * The current node.
+ *@param checkData
+ * If true, count only nodes whose data is non-null.
+ *@param numNodes2
+ * The number of nodes so far.
+ *@return The number of nodes accounted.
+ */
+ private int recursiveNodeCalculator(TSTNode currentNode, boolean checkData,
+ int numNodes2) {
+ if (currentNode == null) {
+ return numNodes2;
+ }
+ int numNodes = recursiveNodeCalculator(
+ currentNode.relatives[TSTNode.LOKID], checkData, numNodes2);
+ numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.EQKID],
+ checkData, numNodes);
+ numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.HIKID],
+ checkData, numNodes);
+ if (checkData) {
+ if (currentNode.data != null) {
+ numNodes++;
+ }
+ } else {
+ numNodes++;
+ }
+ return numNodes;
+ }
+
+ /**
+ * Removes the value indexed by key. Also removes all nodes that are rendered
+ * unnecessary by the removal of this data.
+ *
+ *@param key
+ * A string that indexes the object to be removed from
+ * the Trie.
+ */
+ public void remove(String key) {
+ deleteNode(getNode(key.trim().toLowerCase()));
+ }
+
+ /**
+ * Sets the number of characters by which words can differ from target word
+ * when calling the matchAlmost method.
+ *
+ * Arguments less than 0 will set the char difference to 0, and arguments
+ * greater than 3 will set the char difference to 3.
+ *
+ *@param diff
+ * The number of characters by which words can differ from target
+ * word.
+ */
+ public void setMatchAlmostDiff(int diff) {
+ if (diff < 0) {
+ matchAlmostDiff = 0;
+ } else if (diff > 3) {
+ matchAlmostDiff = 3;
+ } else {
+ matchAlmostDiff = diff;
+ }
+ }
+
+ /**
+ * Sets the default maximum number of values returned from the
+ * matchPrefix and matchAlmost methods.
+ *
+ * Set this value to -1 to get an unlimited number of return
+ * values. Note that the methods mentioned above provide overloaded versions
+ * that allow you to specify the maximum number of return values, in which
+ * case this value is temporarily overridden.
+ *
+ *@param num
+ * The number of values that will be returned when calling the
+ * methods above.
+ */
+ public void setNumReturnValues(int num) {
+ defaultNumReturnValues = (num < 0) ? -1 : num;
+ }
+
+ /**
+ * Returns keys sorted in alphabetical order. This includes the start Node and
+ * all nodes connected to the start Node.
+ *
+ * The number of keys returned is limited to numReturnValues. To get a list
+ * that isn't limited in size, set numReturnValues to -1.
+ *
+ *@param startNode
+ * The top node defining the subtrie to be searched.
+ *@param numReturnValues
+ * The maximum number of values returned from this method.
+ *@return A List with the results.
+ */
+ protected List sortKeys(TSTNode startNode, int numReturnValues) {
+ return sortKeysRecursion(startNode, ((numReturnValues < 0) ? -1
+ : numReturnValues), new Vector());
+ }
+
+ /**
+ * Returns keys sorted in alphabetical order. This includes the current Node
+ * and all nodes connected to the current Node.
+ *
+ * Sorted keys will be appended to the end of the resulting List.
+ * The result may be empty when this method is invoked, but may not be
+ * null.
+ *
+ *@param currentNode
+ * The current node.
+ *@param sortKeysNumReturnValues
+ * The maximum number of values in the result.
+ *@param sortKeysResult2
+ * The results so far.
+ *@return A List with the results.
+ */
+ private List sortKeysRecursion(TSTNode currentNode,
+ int sortKeysNumReturnValues, List sortKeysResult2) {
+ if (currentNode == null) {
+ return sortKeysResult2;
+ }
+ List sortKeysResult = sortKeysRecursion(
+ currentNode.relatives[TSTNode.LOKID], sortKeysNumReturnValues,
+ sortKeysResult2);
+ if (sortKeysNumReturnValues != -1
+ && sortKeysResult.size() >= sortKeysNumReturnValues) {
+ return sortKeysResult;
+ }
+ if (currentNode.data != null) {
+ sortKeysResult.add(getKey(currentNode));
+ }
+ sortKeysResult = sortKeysRecursion(currentNode.relatives[TSTNode.EQKID],
+ sortKeysNumReturnValues, sortKeysResult);
+ return sortKeysRecursion(currentNode.relatives[TSTNode.HIKID],
+ sortKeysNumReturnValues, sortKeysResult);
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java 2011-05-22 17:05:53.000000000 -0400
@@ -0,0 +1,142 @@
+package org.apache.lucene.search.suggest.tst;
+
+import java.util.*;
+
+public class TSTAutocomplete {
+
+ /**
+ * Inserting keys in TST in the order middle,small,big (lexicographic measure)
+ * recursively creates a balanced tree which reduces insertion and search
+ * times significantly.
+ *
+ * @param tokens
+ * Sorted list of keys to be inserted in TST.
+ * @param lo
+ * stores the lower index of current list.
+ * @param hi
+ * stores the higher index of current list.
+ * @param root
+ * a reference object to root of TST.
+ */
+ public void balancedTree(Object[] tokens, Object[] vals, int lo, int hi,
+ TernaryTreeNode root) {
+ if (lo > hi) return;
+ int mid = (lo + hi) / 2;
+ root = insert(root, (String) tokens[mid], vals[mid], 0);
+ balancedTree(tokens, vals, lo, mid - 1, root);
+ balancedTree(tokens, vals, mid + 1, hi, root);
+ }
+
+ /**
+ * Inserts a key in TST creating a series of Binary Search Trees at each node.
+ * The key is actually stored across the eqKid of each node in a successive
+ * manner.
+ *
+ * @param currentNode
+ * the node at which the insertion currently takes place.
+ * @param s
+ * key to be inserted in TST.
+ * @param x
+ * index of character in key to be inserted currently.
+ * @return currentNode The new reference to root node of TST
+ */
+ public TernaryTreeNode insert(TernaryTreeNode currentNode, String s,
+ Object val, int x) {
+ if (s == null || s.length() <= x) {
+ return currentNode;
+ }
+ if (currentNode == null) {
+ TernaryTreeNode newNode = new TernaryTreeNode();
+ newNode.splitchar = s.charAt(x);
+ currentNode = newNode;
+ if (x < s.length() - 1) {
+ currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1);
+ } else {
+ currentNode.token = s;
+ currentNode.val = val;
+ return currentNode;
+ }
+ } else if (currentNode.splitchar > s.charAt(x)) {
+ currentNode.loKid = insert(currentNode.loKid, s, val, x);
+ } else if (currentNode.splitchar == s.charAt(x)) {
+ if (x < s.length() - 1) {
+ currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1);
+ } else {
+ currentNode.token = s;
+ currentNode.val = val;
+ return currentNode;
+ }
+ } else {
+ currentNode.hiKid = insert(currentNode.hiKid, s, val, x);
+ }
+ return currentNode;
+ }
+
+ /**
+ * Auto-completes a given prefix query using Depth-First Search with the end
+ * of prefix as source node each time finding a new leaf to get a complete key
+ * to be added in the suggest list.
+ *
+ * @param root
+ * a reference to root node of TST.
+ * @param s
+ * prefix query to be auto-completed.
+ * @param x
+ * index of current character to be searched while traversing through
+ * the prefix in TST.
+ * @return suggest list of auto-completed keys for the given prefix query.
+ */
+ public ArrayList prefixCompletion(TernaryTreeNode root,
+ String s, int x) {
+
+ TernaryTreeNode p = root;
+ ArrayList suggest = new ArrayList();
+
+ while (p != null) {
+ if (s.charAt(x) < p.splitchar) {
+ p = p.loKid;
+ } else if (s.charAt(x) == p.splitchar) {
+ if (x == s.length() - 1) {
+ break;
+ } else {
+ x++;
+ }
+ p = p.eqKid;
+ } else {
+ p = p.hiKid;
+ }
+ }
+
+ if (p == null) return suggest;
+ if (p.eqKid == null && p.token == null) return suggest;
+ if (p.eqKid == null && p.token != null) {
+ suggest.add(p);
+ return suggest;
+ }
+
+ if (p.token != null) {
+ suggest.add(p);
+ }
+ p = p.eqKid;
+
+ Stack st = new Stack();
+ st.push(p);
+ while (!st.empty()) {
+ TernaryTreeNode top = st.peek();
+ st.pop();
+ if (top.token != null) {
+ suggest.add(top);
+ }
+ if (top.eqKid != null) {
+ st.push(top.eqKid);
+ }
+ if (top.loKid != null) {
+ st.push(top.loKid);
+ }
+ if (top.hiKid != null) {
+ st.push(top.hiKid);
+ }
+ }
+ return suggest;
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java 2011-05-22 18:07:54.000000000 -0400
@@ -0,0 +1,174 @@
+package org.apache.lucene.search.suggest.tst;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
+import org.apache.lucene.search.spell.SortedIterator;
+import org.apache.lucene.search.spell.TermFreqIterator;
+
+public class TSTLookup extends Lookup {
+ TernaryTreeNode root = new TernaryTreeNode();
+ TSTAutocomplete autocomplete = new TSTAutocomplete();
+
+ @Override
+ public void build(TermFreqIterator tfit) throws IOException {
+ root = new TernaryTreeNode();
+ // buffer first
+ if (!(tfit instanceof SortedIterator)) {
+ // make sure it's sorted
+ tfit = new SortedTermFreqIteratorWrapper(tfit);
+ }
+
+ ArrayList tokens = new ArrayList();
+ ArrayList vals = new ArrayList();
+ while (tfit.hasNext()) {
+ tokens.add(tfit.next());
+ vals.add(new Float(tfit.freq()));
+ }
+ autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
+ }
+
+ @Override
+ public boolean add(String key, Object value) {
+ autocomplete.insert(root, key, value, 0);
+ // XXX we don't know if a new node was created
+ return true;
+ }
+
+ @Override
+ public Object get(String key) {
+ List list = autocomplete.prefixCompletion(root, key, 0);
+ if (list == null || list.isEmpty()) {
+ return null;
+ }
+ for (TernaryTreeNode n : list) {
+ if (n.token.equals(key)) {
+ return n.val;
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public List lookup(String key, boolean onlyMorePopular, int num) {
+ List list = autocomplete.prefixCompletion(root, key, 0);
+ List res = new ArrayList();
+ if (list == null || list.size() == 0) {
+ return res;
+ }
+ int maxCnt = Math.min(num, list.size());
+ if (onlyMorePopular) {
+ LookupPriorityQueue queue = new LookupPriorityQueue(num);
+ for (TernaryTreeNode ttn : list) {
+ queue.insertWithOverflow(new LookupResult(ttn.token, (Float)ttn.val));
+ }
+ for (LookupResult lr : queue.getResults()) {
+ res.add(lr);
+ }
+ } else {
+ for (int i = 0; i < maxCnt; i++) {
+ TernaryTreeNode ttn = list.get(i);
+ res.add(new LookupResult(ttn.token, (Float)ttn.val));
+ }
+ }
+ return res;
+ }
+
+ public static final String FILENAME = "tst.dat";
+
+ private static final byte LO_KID = 0x01;
+ private static final byte EQ_KID = 0x02;
+ private static final byte HI_KID = 0x04;
+ private static final byte HAS_TOKEN = 0x08;
+ private static final byte HAS_VALUE = 0x10;
+
+ @Override
+ public synchronized boolean load(File storeDir) throws IOException {
+ File data = new File(storeDir, FILENAME);
+ if (!data.exists() || !data.canRead()) {
+ return false;
+ }
+ DataInputStream in = new DataInputStream(new FileInputStream(data));
+ root = new TernaryTreeNode();
+ try {
+ readRecursively(in, root);
+ } finally {
+ in.close();
+ }
+ return true;
+ }
+
+ // pre-order traversal
+ private void readRecursively(DataInputStream in, TernaryTreeNode node) throws IOException {
+ node.splitchar = in.readChar();
+ byte mask = in.readByte();
+ if ((mask & HAS_TOKEN) != 0) {
+ node.token = in.readUTF();
+ }
+ if ((mask & HAS_VALUE) != 0) {
+ node.val = new Float(in.readFloat());
+ }
+ if ((mask & LO_KID) != 0) {
+ node.loKid = new TernaryTreeNode();
+ readRecursively(in, node.loKid);
+ }
+ if ((mask & EQ_KID) != 0) {
+ node.eqKid = new TernaryTreeNode();
+ readRecursively(in, node.eqKid);
+ }
+ if ((mask & HI_KID) != 0) {
+ node.hiKid = new TernaryTreeNode();
+ readRecursively(in, node.hiKid);
+ }
+ }
+
+ @Override
+ public synchronized boolean store(File storeDir) throws IOException {
+ if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
+ return false;
+ }
+ File data = new File(storeDir, FILENAME);
+ DataOutputStream out = new DataOutputStream(new FileOutputStream(data));
+ try {
+ writeRecursively(out, root);
+ out.flush();
+ } finally {
+ out.close();
+ }
+ return true;
+ }
+
+ // pre-order traversal
+ private void writeRecursively(DataOutputStream out, TernaryTreeNode node) throws IOException {
+ // write out the current node
+ out.writeChar(node.splitchar);
+ // prepare a mask of kids
+ byte mask = 0;
+ if (node.eqKid != null) mask |= EQ_KID;
+ if (node.loKid != null) mask |= LO_KID;
+ if (node.hiKid != null) mask |= HI_KID;
+ if (node.token != null) mask |= HAS_TOKEN;
+ if (node.val != null) mask |= HAS_VALUE;
+ out.writeByte(mask);
+ if (node.token != null) out.writeUTF(node.token);
+ if (node.val != null) out.writeFloat((Float)node.val);
+ // recurse and write kids
+ if (node.loKid != null) {
+ writeRecursively(out, node.loKid);
+ }
+ if (node.eqKid != null) {
+ writeRecursively(out, node.eqKid);
+ }
+ if (node.hiKid != null) {
+ writeRecursively(out, node.hiKid);
+ }
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TernaryTreeNode.java lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TernaryTreeNode.java
--- lucene-clean-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TernaryTreeNode.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TernaryTreeNode.java 2011-05-22 17:05:45.000000000 -0400
@@ -0,0 +1,25 @@
+package org.apache.lucene.search.suggest.tst;
+
+/**
+ * The class creates a TST node.
+ */
+
+public class TernaryTreeNode {
+ /** the character stored by a node. */
+ char splitchar;
+ /** a reference object to the node containing character smaller than this node's character. */
+ TernaryTreeNode loKid;
+ /**
+ * a reference object to the node containing character next to this node's character as
+ * occurring in the inserted token.
+ */
+ TernaryTreeNode eqKid;
+ /** a reference object to the node containing character higher than this node's character. */
+ TernaryTreeNode hiKid;
+ /**
+ * used by leaf nodes to store the complete tokens to be added to suggest list while
+ * auto-completing the prefix.
+ */
+ String token;
+ Object val;
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,144 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDirectSpellChecker extends LuceneTestCase {
+
+ public void testSimpleExamples() throws Exception {
+ DirectSpellChecker spellChecker = new DirectSpellChecker();
+ spellChecker.setMinQueryLength(0);
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+ new MockAnalyzer(random, MockTokenizer.SIMPLE, true));
+
+ for (int i = 0; i < 20; i++) {
+ Document doc = new Document();
+ doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ }
+
+ IndexReader ir = writer.getReader();
+
+ SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
+ assertTrue(similar.length > 0);
+ assertEquals("five", similar[0].string);
+
+ similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir, false);
+ if (similar.length > 0) {
+ assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself
+ }
+
+ similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
+ assertTrue(similar.length > 0);
+ assertEquals("five", similar[0].string);
+
+ similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false);
+ assertTrue(similar.length > 0);
+ assertEquals("five", similar[0].string);
+
+ similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false);
+ assertTrue(similar.length > 0);
+ assertEquals("five", similar[0].string);
+
+ assertTrue(similar.length > 0);
+ similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false);
+ assertEquals("five", similar[0].string);
+
+ // add some more documents
+ for (int i = 1000; i < 1100; i++) {
+ Document doc = new Document();
+ doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ }
+
+ ir.close();
+ ir = writer.getReader();
+
+ // look ma, no spellcheck index rebuild
+ similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false);
+ assertTrue(similar.length > 0);
+ assertEquals("thousand", similar[0].string);
+
+ ir.close();
+ writer.close();
+ dir.close();
+ }
+
+ public void testOptions() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+ new MockAnalyzer(random, MockTokenizer.SIMPLE, true));
+
+ Document doc = new Document();
+ doc.add(newField("text", "foobar", Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ doc.add(newField("text", "foobar", Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ doc.add(newField("text", "foobaz", Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ doc.add(newField("text", "fobar", Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+
+ IndexReader ir = writer.getReader();
+
+ DirectSpellChecker spellChecker = new DirectSpellChecker();
+ spellChecker.setMaxQueryFrequency(0F);
+ SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 1, ir, true);
+ assertEquals(0, similar.length);
+
+ spellChecker = new DirectSpellChecker(); // reset defaults
+ spellChecker.setMinQueryLength(5);
+ similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir, true);
+ assertEquals(0, similar.length);
+
+ spellChecker = new DirectSpellChecker(); // reset defaults
+ spellChecker.setMaxEdits(1);
+ similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
+ assertEquals(0, similar.length);
+
+ spellChecker = new DirectSpellChecker(); // reset defaults
+ spellChecker.setAccuracy(0.9F);
+ similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
+ assertEquals(0, similar.length);
+
+ spellChecker = new DirectSpellChecker(); // reset defaults
+ spellChecker.setMinPrefix(0);
+ similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
+ assertEquals(1, similar.length);
+
+ spellChecker = new DirectSpellChecker(); // reset defaults
+ spellChecker.setMinPrefix(1);
+ similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
+ assertEquals(0, similar.length);
+
+ ir.close();
+ writer.close();
+ dir.close();
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,49 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestJaroWinklerDistance extends LuceneTestCase {
+
+ private StringDistance sd = new JaroWinklerDistance();
+
+ public void testGetDistance() {
+ float d = sd.getDistance("al", "al");
+ assertTrue(d == 1.0f);
+ d = sd.getDistance("martha", "marhta");
+ assertTrue(d > 0.961 && d <0.962);
+ d = sd.getDistance("jones", "johnson");
+ assertTrue(d > 0.832 && d < 0.833);
+ d = sd.getDistance("abcvwxyz", "cabvwxyz");
+ assertTrue(d > 0.958 && d < 0.959);
+ d = sd.getDistance("dwayne", "duane");
+ assertTrue(d > 0.84 && d < 0.841);
+ d = sd.getDistance("dixon", "dicksonx");
+ assertTrue(d > 0.813 && d < 0.814);
+ d = sd.getDistance("fvie", "ten");
+ assertTrue(d == 0f);
+ float d1 = sd.getDistance("zac ephron", "zac efron");
+ float d2 = sd.getDistance("zac ephron", "kai ephron");
+ assertTrue(d1 > d2);
+ d1 = sd.getDistance("brittney spears", "britney spears");
+ d2 = sd.getDistance("brittney spears", "brittney startzman");
+ assertTrue(d1 > d2);
+ }
+
+}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,54 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestLevenshteinDistance extends LuceneTestCase {
+
+ private StringDistance sd = new LevensteinDistance();
+
+ public void testGetDistance() {
+ float d = sd.getDistance("al", "al");
+ assertEquals(d,1.0f,0.001);
+ d = sd.getDistance("martha", "marhta");
+ assertEquals(d,0.6666,0.001);
+ d = sd.getDistance("jones", "johnson");
+ assertEquals(d,0.4285,0.001);
+ d = sd.getDistance("abcvwxyz", "cabvwxyz");
+ assertEquals(d,0.75,0.001);
+ d = sd.getDistance("dwayne", "duane");
+ assertEquals(d,0.666,0.001);
+ d = sd.getDistance("dixon", "dicksonx");
+ assertEquals(d,0.5,0.001);
+ d = sd.getDistance("six", "ten");
+ assertEquals(d,0,0.001);
+ float d1 = sd.getDistance("zac ephron", "zac efron");
+ float d2 = sd.getDistance("zac ephron", "kai ephron");
+ assertEquals(d1,d2,0.001);
+ d1 = sd.getDistance("brittney spears", "britney spears");
+ d2 = sd.getDistance("brittney spears", "brittney startzman");
+ assertTrue(d1 > d2);
+ }
+
+ public void testEmpty() throws Exception {
+ float d = sd.getDistance("", "al");
+ assertEquals(d,0.0f,0.001);
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,210 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Test case for LuceneDictionary.
+ * It first creates a simple index and then a couple of instances of LuceneDictionary
+ * on different fields and checks if all the right text comes back.
+ */
+public class TestLuceneDictionary extends LuceneTestCase {
+
+ private Directory store;
+
+ private IndexReader indexReader = null;
+ private LuceneDictionary ld;
+ private Iterator it;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ store = newDirectory();
+ IndexWriter writer = new IndexWriter(store, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
+
+ Document doc;
+
+ doc = new Document();
+ doc.add(newField("aaa", "foo", Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+
+ doc = new Document();
+ doc.add(newField("aaa", "foo", Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+
+ doc = new Document();
+ doc.add(new Field("contents", "Tom", Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+
+ doc = new Document();
+ doc.add(new Field("contents", "Jerry", Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+
+ doc = new Document();
+ doc.add(newField("zzz", "bar", Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+
+ writer.optimize();
+ writer.close();
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ if (indexReader != null)
+ indexReader.close();
+ store.close();
+ super.tearDown();
+ }
+
+ public void testFieldNonExistent() throws IOException {
+ try {
+ indexReader = IndexReader.open(store, true);
+
+ ld = new LuceneDictionary(indexReader, "nonexistent_field");
+ it = ld.getWordsIterator();
+
+ assertFalse("More elements than expected", it.hasNext());
+ assertTrue("Nonexistent element is really null", it.next() == null);
+ } finally {
+ if (indexReader != null) { indexReader.close(); }
+ }
+ }
+
+ public void testFieldAaa() throws IOException {
+ try {
+ indexReader = IndexReader.open(store, true);
+
+ ld = new LuceneDictionary(indexReader, "aaa");
+ it = ld.getWordsIterator();
+
+ assertTrue("First element doesn't exist.", it.hasNext());
+ assertTrue("First element isn't correct", it.next().equals("foo"));
+ assertFalse("More elements than expected", it.hasNext());
+ assertTrue("Nonexistent element is really null", it.next() == null);
+ } finally {
+ if (indexReader != null) { indexReader.close(); }
+ }
+ }
+
+ public void testFieldContents_1() throws IOException {
+ try {
+ indexReader = IndexReader.open(store, true);
+
+ ld = new LuceneDictionary(indexReader, "contents");
+ it = ld.getWordsIterator();
+
+ assertTrue("First element doesn't exist.", it.hasNext());
+ assertTrue("First element isn't correct", it.next().equals("Jerry"));
+ assertTrue("Second element doesn't exist.", it.hasNext());
+ assertTrue("Second element isn't correct", it.next().equals("Tom"));
+ assertFalse("More elements than expected", it.hasNext());
+ assertTrue("Nonexistent element is really null", it.next() == null);
+
+ ld = new LuceneDictionary(indexReader, "contents");
+ it = ld.getWordsIterator();
+
+ int counter = 2;
+ while (it.hasNext()) {
+ it.next();
+ counter--;
+ }
+
+ assertTrue("Number of words incorrect", counter == 0);
+ }
+ finally {
+ if (indexReader != null) { indexReader.close(); }
+ }
+ }
+
+ public void testFieldContents_2() throws IOException {
+ try {
+ indexReader = IndexReader.open(store, true);
+
+ ld = new LuceneDictionary(indexReader, "contents");
+ it = ld.getWordsIterator();
+
+ // hasNext() should have no side effects
+ assertTrue("First element isn't were it should be.", it.hasNext());
+ assertTrue("First element isn't were it should be.", it.hasNext());
+ assertTrue("First element isn't were it should be.", it.hasNext());
+
+ // just iterate through words
+ assertTrue("First element isn't correct", it.next().equals("Jerry"));
+ assertTrue("Second element isn't correct", it.next().equals("Tom"));
+ assertTrue("Nonexistent element is really null", it.next() == null);
+
+ // hasNext() should still have no side effects ...
+ assertFalse("There should be any more elements", it.hasNext());
+ assertFalse("There should be any more elements", it.hasNext());
+ assertFalse("There should be any more elements", it.hasNext());
+
+ // .. and there are really no more words
+ assertTrue("Nonexistent element is really null", it.next() == null);
+ assertTrue("Nonexistent element is really null", it.next() == null);
+ assertTrue("Nonexistent element is really null", it.next() == null);
+ }
+ finally {
+ if (indexReader != null) { indexReader.close(); }
+ }
+ }
+
+ public void testFieldZzz() throws IOException {
+ try {
+ indexReader = IndexReader.open(store, true);
+
+ ld = new LuceneDictionary(indexReader, "zzz");
+ it = ld.getWordsIterator();
+
+ assertTrue("First element doesn't exist.", it.hasNext());
+ assertTrue("First element isn't correct", it.next().equals("bar"));
+ assertFalse("More elements than expected", it.hasNext());
+ assertTrue("Nonexistent element is really null", it.next() == null);
+ }
+ finally {
+ if (indexReader != null) { indexReader.close(); }
+ }
+ }
+
+ public void testSpellchecker() throws IOException {
+ Directory dir = newDirectory();
+ SpellChecker sc = new SpellChecker(dir);
+ indexReader = IndexReader.open(store, true);
+ sc.indexDictionary(new LuceneDictionary(indexReader, "contents"));
+ String[] suggestions = sc.suggestSimilar("Tam", 1);
+ assertEquals(1, suggestions.length);
+ assertEquals("Tom", suggestions[0]);
+ suggestions = sc.suggestSimilar("Jarry", 1);
+ assertEquals(1, suggestions.length);
+ assertEquals("Jerry", suggestions[0]);
+ indexReader.close();
+ sc.close();
+ dir.close();
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestNGramDistance.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestNGramDistance.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestNGramDistance.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestNGramDistance.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,132 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestNGramDistance extends LuceneTestCase {
+
+
+
+ public void testGetDistance1() {
+ StringDistance nsd = new NGramDistance(1);
+ float d = nsd.getDistance("al", "al");
+ assertEquals(d,1.0f,0.001);
+ d = nsd.getDistance("a", "a");
+ assertEquals(d,1.0f,0.001);
+ d = nsd.getDistance("b", "a");
+ assertEquals(d,0.0f,0.001);
+ d = nsd.getDistance("martha", "marhta");
+ assertEquals(d,0.6666,0.001);
+ d = nsd.getDistance("jones", "johnson");
+ assertEquals(d,0.4285,0.001);
+ d = nsd.getDistance("natural", "contrary");
+ assertEquals(d,0.25,0.001);
+ d = nsd.getDistance("abcvwxyz", "cabvwxyz");
+ assertEquals(d,0.75,0.001);
+ d = nsd.getDistance("dwayne", "duane");
+ assertEquals(d,0.666,0.001);
+ d = nsd.getDistance("dixon", "dicksonx");
+ assertEquals(d,0.5,0.001);
+ d = nsd.getDistance("six", "ten");
+ assertEquals(d,0,0.001);
+ float d1 = nsd.getDistance("zac ephron", "zac efron");
+ float d2 = nsd.getDistance("zac ephron", "kai ephron");
+ assertEquals(d1,d2,0.001);
+ d1 = nsd.getDistance("brittney spears", "britney spears");
+ d2 = nsd.getDistance("brittney spears", "brittney startzman");
+ assertTrue(d1 > d2);
+ d1 = nsd.getDistance("12345678", "12890678");
+ d2 = nsd.getDistance("12345678", "72385698");
+ assertEquals(d1,d2,0.001);
+ }
+
+ public void testGetDistance2() {
+ StringDistance sd = new NGramDistance(2);
+ float d = sd.getDistance("al", "al");
+ assertEquals(d,1.0f,0.001);
+ d = sd.getDistance("a", "a");
+ assertEquals(d,1.0f,0.001);
+ d = sd.getDistance("b", "a");
+ assertEquals(d,0.0f,0.001);
+ d = sd.getDistance("a", "aa");
+ assertEquals(d,0.5f,0.001);
+ d = sd.getDistance("martha", "marhta");
+ assertEquals(d,0.6666,0.001);
+ d = sd.getDistance("jones", "johnson");
+ assertEquals(d,0.4285,0.001);
+ d = sd.getDistance("natural", "contrary");
+ assertEquals(d,0.25,0.001);
+ d = sd.getDistance("abcvwxyz", "cabvwxyz");
+ assertEquals(d,0.625,0.001);
+ d = sd.getDistance("dwayne", "duane");
+ assertEquals(d,0.5833,0.001);
+ d = sd.getDistance("dixon", "dicksonx");
+ assertEquals(d,0.5,0.001);
+ d = sd.getDistance("six", "ten");
+ assertEquals(d,0,0.001);
+ float d1 = sd.getDistance("zac ephron", "zac efron");
+ float d2 = sd.getDistance("zac ephron", "kai ephron");
+ assertTrue(d1 > d2);
+ d1 = sd.getDistance("brittney spears", "britney spears");
+ d2 = sd.getDistance("brittney spears", "brittney startzman");
+ assertTrue(d1 > d2);
+ d1 = sd.getDistance("0012345678", "0012890678");
+ d2 = sd.getDistance("0012345678", "0072385698");
+ assertEquals(d1,d2,0.001);
+ }
+
+ public void testGetDistance3() {
+ StringDistance sd = new NGramDistance(3);
+ float d = sd.getDistance("al", "al");
+ assertEquals(d,1.0f,0.001);
+ d = sd.getDistance("a", "a");
+ assertEquals(d,1.0f,0.001);
+ d = sd.getDistance("b", "a");
+ assertEquals(d,0.0f,0.001);
+ d = sd.getDistance("martha", "marhta");
+ assertEquals(d,0.7222,0.001);
+ d = sd.getDistance("jones", "johnson");
+ assertEquals(d,0.4762,0.001);
+ d = sd.getDistance("natural", "contrary");
+ assertEquals(d,0.2083,0.001);
+ d = sd.getDistance("abcvwxyz", "cabvwxyz");
+ assertEquals(d,0.5625,0.001);
+ d = sd.getDistance("dwayne", "duane");
+ assertEquals(d,0.5277,0.001);
+ d = sd.getDistance("dixon", "dicksonx");
+ assertEquals(d,0.4583,0.001);
+ d = sd.getDistance("six", "ten");
+ assertEquals(d,0,0.001);
+ float d1 = sd.getDistance("zac ephron", "zac efron");
+ float d2 = sd.getDistance("zac ephron", "kai ephron");
+ assertTrue(d1 > d2);
+ d1 = sd.getDistance("brittney spears", "britney spears");
+ d2 = sd.getDistance("brittney spears", "brittney startzman");
+ assertTrue(d1 > d2);
+ d1 = sd.getDistance("0012345678", "0012890678");
+ d2 = sd.getDistance("0012345678", "0072385698");
+ assertTrue(d1 < d2);
+ }
+
+ public void testEmpty() throws Exception {
+ StringDistance nsd = new NGramDistance(1);
+ float d = nsd.getDistance("", "al");
+ assertEquals(d,0.0f,0.001);
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestPlainTextDictionary.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,47 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Test case for PlainTextDictionary
+ *
+ */
+public class TestPlainTextDictionary extends LuceneTestCase {
+
+ public void testBuild() throws IOException {
+ final String LF = System.getProperty("line.separator");
+ String input = "oneword" + LF + "twoword" + LF + "threeword";
+ PlainTextDictionary ptd = new PlainTextDictionary(new StringReader(input));
+ Directory ramDir = newDirectory();
+ SpellChecker spellChecker = new SpellChecker(ramDir);
+ spellChecker.indexDictionary(ptd);
+ String[] similar = spellChecker.suggestSimilar("treeword", 2);
+ assertEquals(2, similar.length);
+ assertEquals(similar[0], "threeword");
+ assertEquals(similar[1], "oneword");
+ spellChecker.close();
+ ramDir.close();
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java 2011-05-22 16:44:12.000000000 -0400
@@ -0,0 +1,438 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Spell checker test case
+ */
+public class TestSpellChecker extends LuceneTestCase {
+ private SpellCheckerMock spellChecker;
+ private Directory userindex, spellindex;
+ private List<IndexSearcher> searchers;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+
+ //create a user index
+ userindex = newDirectory();
+ IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig(
+ TEST_VERSION_CURRENT, new MockAnalyzer(random)));
+
+ for (int i = 0; i < 1000; i++) {
+ Document doc = new Document();
+ doc.add(newField("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(newField("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
+ doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // + word "five" on even docs
+ writer.addDocument(doc);
+ }
+ writer.close();
+ searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>());
+ // create the spellChecker
+ spellindex = newDirectory();
+ spellChecker = new SpellCheckerMock(spellindex);
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ userindex.close();
+ if (!spellChecker.isClosed())
+ spellChecker.close();
+ spellindex.close();
+ super.tearDown();
+ }
+
+
+ public void testBuild() throws CorruptIndexException, IOException {
+ IndexReader r = IndexReader.open(userindex, true);
+
+ spellChecker.clearIndex();
+
+ addwords(r, spellChecker, "field1");
+ int num_field1 = this.numdoc();
+
+ addwords(r, spellChecker, "field2");
+ int num_field2 = this.numdoc();
+
+ assertEquals(num_field2, num_field1 + 1);
+
+ assertLastSearcherOpen(4);
+
+ checkCommonSuggestions(r);
+ checkLevenshteinSuggestions(r);
+
+ spellChecker.setStringDistance(new JaroWinklerDistance());
+ spellChecker.setAccuracy(0.8f);
+ checkCommonSuggestions(r);
+ checkJaroWinklerSuggestions();
+ // the accuracy is set to 0.8 by default, but the best result has a score of 0.925
+ String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f);
+ assertTrue(similar.length == 0);
+ similar = spellChecker.suggestSimilar("fvie", 2, 0.92f);
+ assertTrue(similar.length == 1);
+
+ similar = spellChecker.suggestSimilar("fiv", 2);
+ assertTrue(similar.length > 0);
+ assertEquals(similar[0], "five");
+
+ spellChecker.setStringDistance(new NGramDistance(2));
+ spellChecker.setAccuracy(0.5f);
+ checkCommonSuggestions(r);
+ checkNGramSuggestions();
+
+ r.close();
+ }
+
+ public void testComparator() throws Exception {
+ IndexReader r = IndexReader.open(userindex, true);
+ Directory compIdx = newDirectory();
+ SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
+ addwords(r, compareSP, "field3");
+
+ String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false);
+ assertTrue(similar.length == 2);
+ //five and fvei have the same score, but different frequencies.
+ assertEquals("fvei", similar[0]);
+ assertEquals("five", similar[1]);
+ r.close();
+ if (!compareSP.isClosed())
+ compareSP.close();
+ compIdx.close();
+ }
+
+ private void checkCommonSuggestions(IndexReader r) throws IOException {
+ String[] similar = spellChecker.suggestSimilar("fvie", 2);
+ assertTrue(similar.length > 0);
+ assertEquals(similar[0], "five");
+
+ similar = spellChecker.suggestSimilar("five", 2);
+ if (similar.length > 0) {
+ assertFalse(similar[0].equals("five")); // don't suggest a word for itself
+ }
+
+ similar = spellChecker.suggestSimilar("fiv", 2);
+ assertTrue(similar.length > 0);
+ assertEquals(similar[0], "five");
+
+ similar = spellChecker.suggestSimilar("fives", 2);
+ assertTrue(similar.length > 0);
+ assertEquals(similar[0], "five");
+
+ assertTrue(similar.length > 0);
+ similar = spellChecker.suggestSimilar("fie", 2);
+ assertEquals(similar[0], "five");
+
+ // test restraint to a field
+ similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
+ assertEquals(0, similar.length); // there isn't the term thousand in the field field1
+
+ similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
+ assertEquals(1, similar.length); // there is the term thousand in the field field2
+ }
+
+ private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
+ // test small word
+ String[] similar = spellChecker.suggestSimilar("fvie", 2);
+ assertEquals(1, similar.length);
+ assertEquals(similar[0], "five");
+
+ similar = spellChecker.suggestSimilar("five", 2);
+ assertEquals(1, similar.length);
+ assertEquals(similar[0], "nine"); // don't suggest a word for itself
+
+ similar = spellChecker.suggestSimilar("fiv", 2);
+ assertEquals(1, similar.length);
+ assertEquals(similar[0], "five");
+
+ similar = spellChecker.suggestSimilar("ive", 2);
+ assertEquals(2, similar.length);
+ assertEquals(similar[0], "five");
+ assertEquals(similar[1], "nine");
+
+ similar = spellChecker.suggestSimilar("fives", 2);
+ assertEquals(1, similar.length);
+ assertEquals(similar[0], "five");
+
+ similar = spellChecker.suggestSimilar("fie", 2);
+ assertEquals(2, similar.length);
+ assertEquals(similar[0], "five");
+ assertEquals(similar[1], "nine");
+
+ similar = spellChecker.suggestSimilar("fi", 2);
+ assertEquals(1, similar.length);
+ assertEquals(similar[0], "five");
+
+ // test restraint to a field
+ similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
+ assertEquals(0, similar.length); // there isn't the term thousand in the field field1
+
+ similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
+ assertEquals(1, similar.length); // there is the term thousand in the field field2
+
+ similar = spellChecker.suggestSimilar("onety", 2);
+ assertEquals(2, similar.length);
+ assertEquals(similar[0], "ninety");
+ assertEquals(similar[1], "one");
+ try {
+ similar = spellChecker.suggestSimilar("tousand", 10, r, null, false);
+ } catch (NullPointerException e) {
+ assertTrue("threw an NPE, and it shouldn't have", false);
+ }
+ }
+
+ private void checkJaroWinklerSuggestions() throws IOException {
+ String[] similar = spellChecker.suggestSimilar("onety", 2);
+ assertEquals(2, similar.length);
+ assertEquals(similar[0], "one");
+ assertEquals(similar[1], "ninety");
+ }
+
+ private void checkNGramSuggestions() throws IOException {
+ String[] similar = spellChecker.suggestSimilar("onety", 2);
+ assertEquals(2, similar.length);
+ assertEquals(similar[0], "one");
+ assertEquals(similar[1], "ninety");
+ }
+
+ private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException {
+ long time = System.currentTimeMillis();
+ sc.indexDictionary(new LuceneDictionary(r, field));
+ time = System.currentTimeMillis() - time;
+ //System.out.println("time to build " + field + ": " + time);
+ }
+
+ private int numdoc() throws IOException {
+ IndexReader rs = IndexReader.open(spellindex, true);
+ int num = rs.numDocs();
+ assertTrue(num != 0);
+ //System.out.println("num docs: " + num);
+ rs.close();
+ return num;
+ }
+
+ public void testClose() throws IOException {
+ IndexReader r = IndexReader.open(userindex, true);
+ spellChecker.clearIndex();
+ String field = "field1";
+ addwords(r, spellChecker, "field1");
+ int num_field1 = this.numdoc();
+ addwords(r, spellChecker, "field2");
+ int num_field2 = this.numdoc();
+ assertEquals(num_field2, num_field1 + 1);
+ checkCommonSuggestions(r);
+ assertLastSearcherOpen(4);
+ spellChecker.close();
+ assertSearchersClosed();
+ try {
+ spellChecker.close();
+ fail("spellchecker was already closed");
+ } catch (AlreadyClosedException e) {
+ // expected
+ }
+ try {
+ checkCommonSuggestions(r);
+ fail("spellchecker was already closed");
+ } catch (AlreadyClosedException e) {
+ // expected
+ }
+
+ try {
+ spellChecker.clearIndex();
+ fail("spellchecker was already closed");
+ } catch (AlreadyClosedException e) {
+ // expected
+ }
+
+ try {
+ spellChecker.indexDictionary(new LuceneDictionary(r, field));
+ fail("spellchecker was already closed");
+ } catch (AlreadyClosedException e) {
+ // expected
+ }
+
+ try {
+ spellChecker.setSpellIndex(spellindex);
+ fail("spellchecker was already closed");
+ } catch (AlreadyClosedException e) {
+ // expected
+ }
+ assertEquals(4, searchers.size());
+ assertSearchersClosed();
+ r.close();
+ }
+
+ /*
+ * tests if the internally shared indexsearcher is correctly closed
+ * when the spellchecker is concurrently accessed and closed.
+ */
+ public void testConcurrentAccess() throws IOException, InterruptedException {
+ assertEquals(1, searchers.size());
+ final IndexReader r = IndexReader.open(userindex, true);
+ spellChecker.clearIndex();
+ assertEquals(2, searchers.size());
+ addwords(r, spellChecker, "field1");
+ assertEquals(3, searchers.size());
+ int num_field1 = this.numdoc();
+ addwords(r, spellChecker, "field2");
+ assertEquals(4, searchers.size());
+ int num_field2 = this.numdoc();
+ assertEquals(num_field2, num_field1 + 1);
+ int numThreads = 5 + this.random.nextInt(5);
+ ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+ SpellCheckWorker[] workers = new SpellCheckWorker[numThreads];
+ for (int i = 0; i < numThreads; i++) {
+ SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r);
+ executor.execute(spellCheckWorker);
+ workers[i] = spellCheckWorker;
+
+ }
+ int iterations = 5 + random.nextInt(5);
+ for (int i = 0; i < iterations; i++) {
+ Thread.sleep(100);
+ // concurrently reset the spell index
+ spellChecker.setSpellIndex(this.spellindex);
+ // for debug - prints the internal open searchers
+ // showSearchersOpen();
+ }
+
+ spellChecker.close();
+ executor.shutdown();
+ // wait for 60 seconds - usually this is very fast but coverage runs could take quite long
+ executor.awaitTermination(60L, TimeUnit.SECONDS);
+
+ for (int i = 0; i < workers.length; i++) {
+ assertFalse(String.format("worker thread %d failed", i), workers[i].failed);
+ assertTrue(String.format("worker thread %d is still running but should be terminated", i), workers[i].terminated);
+ }
+ // 4 searchers more than iterations
+ // 1. at creation
+ // 2. clearIndex()
+ // 3. and 4. during addwords
+ assertEquals(iterations + 4, searchers.size());
+ assertSearchersClosed();
+ r.close();
+ }
+
+ private void assertLastSearcherOpen(int numSearchers) {
+ assertEquals(numSearchers, searchers.size());
+ IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]);
+ for (int i = 0; i < searcherArray.length; i++) {
+ if (i == searcherArray.length - 1) {
+ assertTrue("expected last searcher open but was closed",
+ searcherArray[i].getIndexReader().getRefCount() > 0);
+ } else {
+ assertFalse("expected closed searcher but was open - Index: " + i,
+ searcherArray[i].getIndexReader().getRefCount() > 0);
+ }
+ }
+ }
+
+ private void assertSearchersClosed() {
+ for (IndexSearcher searcher : searchers) {
+ assertEquals(0, searcher.getIndexReader().getRefCount());
+ }
+ }
+
+ // For debug
+// private void showSearchersOpen() {
+// int count = 0;
+// for (IndexSearcher searcher : searchers) {
+// if(searcher.getIndexReader().getRefCount() > 0)
+// ++count;
+// }
+// System.out.println(count);
+// }
+
+
+ private class SpellCheckWorker implements Runnable {
+ private final IndexReader reader;
+ volatile boolean terminated = false;
+ volatile boolean failed = false;
+
+ SpellCheckWorker(IndexReader reader) {
+ super();
+ this.reader = reader;
+ }
+
+ public void run() {
+ try {
+ while (true) {
+ try {
+ checkCommonSuggestions(reader);
+ } catch (AlreadyClosedException e) {
+
+ return;
+ } catch (Throwable e) {
+
+ e.printStackTrace();
+ failed = true;
+ return;
+ }
+ }
+ } finally {
+ terminated = true;
+ }
+ }
+
+ }
+
+ class SpellCheckerMock extends SpellChecker {
+ public SpellCheckerMock(Directory spellIndex) throws IOException {
+ super(spellIndex);
+ }
+
+ public SpellCheckerMock(Directory spellIndex, StringDistance sd)
+ throws IOException {
+ super(spellIndex, sd);
+ }
+
+ public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
+ super(spellIndex, sd, comparator);
+ }
+
+ @Override
+ IndexSearcher createSearcher(Directory dir) throws IOException {
+ IndexSearcher searcher = super.createSearcher(dir);
+ TestSpellChecker.this.searchers.add(searcher);
+ return searcher;
+ }
+ }
+
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/Average.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/Average.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/Average.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/Average.java 2011-05-22 18:51:32.000000000 -0400
@@ -0,0 +1,70 @@
+package org.apache.lucene.search.suggest;
+
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Average with standard deviation.
+ */
+final class Average
+{
+ /**
+ * Average (in milliseconds).
+ */
+ public final double avg;
+
+ /**
+ * Standard deviation (in milliseconds).
+ */
+ public final double stddev;
+
+ /**
+ *
+ */
+ Average(double avg, double stddev)
+ {
+ this.avg = avg;
+ this.stddev = stddev;
+ }
+
+ public String toString()
+ {
+ return String.format(Locale.ENGLISH, "%.0f [+- %.2f]",
+ avg, stddev);
+ }
+
+ static Average from(List<Double> values)
+ {
+ double sum = 0;
+ double sumSquares = 0;
+
+ for (double l : values)
+ {
+ sum += l;
+ sumSquares += l * l;
+ }
+
+ double avg = sum / (double) values.size();
+ return new Average(
+ (sum / (double) values.size()),
+ Math.sqrt(sumSquares / (double) values.size() - avg * avg));
+ }
+}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java 2011-05-22 19:05:43.000000000 -0400
@@ -0,0 +1,246 @@
+package org.apache.lucene.search.suggest;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+import java.util.concurrent.Callable;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.fst.FSTLookup;
+import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
+import org.apache.lucene.search.suggest.tst.TSTLookup;
+
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+
+/**
+ * Benchmarks tests for implementations of {@link Lookup} interface.
+ */
+@Ignore("COMMENT ME TO RUN BENCHMARKS!")
+public class LookupBenchmarkTest extends LuceneTestCase {
+ @SuppressWarnings("unchecked")
+ private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
+ JaspellLookup.class,
+ TSTLookup.class,
+ FSTLookup.class);
+
+ private final static int rounds = 15;
+ private final static int warmup = 5;
+
+ private final int num = 7;
+ private final boolean onlyMorePopular = true;
+
+ private final static Random random = new Random(0xdeadbeef);
+
+ /**
+ * Input term/weight pairs.
+ */
+ private static TermFreq [] dictionaryInput;
+
+ /**
+ * Benchmark term/weight pairs (randomized order).
+ */
+ private static List<TermFreq> benchmarkInput;
+
+ /**
+ * Loads terms and frequencies from Wikipedia (cached).
+ */
+ @BeforeClass
+ public static void setup() throws Exception {
+ List<TermFreq> input = readTop50KWiki();
+ Collections.shuffle(input, random);
+ LookupBenchmarkTest.dictionaryInput = input.toArray(new TermFreq [input.size()]);
+ Collections.shuffle(input, random);
+ LookupBenchmarkTest.benchmarkInput = input;
+ }
+
+ static final Charset UTF_8 = Charset.forName("UTF-8");
+
+ /**
+ * Collect the multilingual input for benchmarks/ tests.
+ */
+ public static List<TermFreq> readTop50KWiki() throws Exception {
+ List<TermFreq> input = new ArrayList<TermFreq>();
+ URL resource = LookupBenchmarkTest.class.getResource("Top50KWiki.utf8");
+ assert resource != null : "Resource missing: Top50KWiki.utf8";
+
+ String line = null;
+ BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(), UTF_8));
+ while ((line = br.readLine()) != null) {
+ int tab = line.indexOf('|');
+ assertTrue("No | separator?: " + line, tab >= 0);
+ float weight = Float.parseFloat(line.substring(tab + 1));
+ String key = line.substring(0, tab);
+ input.add(new TermFreq(key, weight));
+ }
+ br.close();
+ return input;
+ }
+
+ /**
+ * Test construction time.
+ */
+ public void testConstructionTime() throws Exception {
+ System.err.println("-- construction time");
+ for (final Class<? extends Lookup> cls : benchmarkClasses) {
+ BenchmarkResult result = measure(new Callable<Integer>() {
+ public Integer call() throws Exception {
+ final Lookup lookup = buildLookup(cls, dictionaryInput);
+ return lookup.hashCode();
+ }
+ });
+
+ System.err.println(
+ String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s",
+ cls.getSimpleName(),
+ dictionaryInput.length,
+ result.average.toString()));
+ }
+ }
+
+ /**
+ * Test memory required for the storage.
+ */
+ public void testStorageNeeds() throws Exception {
+ System.err.println("-- RAM consumption");
+ final RamUsageEstimator rue = new RamUsageEstimator();
+ for (Class<? extends Lookup> cls : benchmarkClasses) {
+ Lookup lookup = buildLookup(cls, dictionaryInput);
+ System.err.println(
+ String.format(Locale.ENGLISH, "%-15s size[B]:%,13d",
+ lookup.getClass().getSimpleName(),
+ rue.estimateRamUsage(lookup)));
+ }
+ }
+
+ /**
+ * Create {@link Lookup} instance and populate it.
+ */
+ private Lookup buildLookup(Class<? extends Lookup> cls, TermFreq[] input) throws Exception {
+ Lookup lookup = cls.newInstance();
+ lookup.build(new TermFreqArrayIterator(input));
+ return lookup;
+ }
+
+ /**
+ * Test performance of lookup on full hits.
+ */
+ public void testPerformanceOnFullHits() throws Exception {
+ final int minPrefixLen = 100;
+ final int maxPrefixLen = 200;
+ runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
+ }
+
+ /**
+ * Test performance of lookup on longer term prefixes (6-9 letters or shorter).
+ */
+ public void testPerformanceOnPrefixes6_9() throws Exception {
+ final int minPrefixLen = 6;
+ final int maxPrefixLen = 9;
+ runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
+ }
+
+ /**
+ * Test performance of lookup on short term prefixes (2-4 letters or shorter).
+ */
+ public void testPerformanceOnPrefixes2_4() throws Exception {
+ final int minPrefixLen = 2;
+ final int maxPrefixLen = 4;
+ runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
+ }
+
+ /**
+ * Run the actual benchmark.
+ */
+ public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen,
+ final int num, final boolean onlyMorePopular) throws Exception {
+ System.err.println(String.format(Locale.ENGLISH,
+ "-- prefixes: %d-%d, num: %d, onlyMorePopular: %s",
+ minPrefixLen, maxPrefixLen, num, onlyMorePopular));
+
+ for (Class<? extends Lookup> cls : benchmarkClasses) {
+ final Lookup lookup = buildLookup(cls, dictionaryInput);
+
+ final List<String> input = new ArrayList<String>(benchmarkInput.size());
+ for (TermFreq tf : benchmarkInput) {
+ input.add(tf.term.substring(0, Math.min(tf.term.length(),
+ minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))));
+ }
+
+ BenchmarkResult result = measure(new Callable<Integer>() {
+ public Integer call() throws Exception {
+ int v = 0;
+ for (String term : input) {
+ v += lookup.lookup(term, onlyMorePopular, num).size();
+ }
+ return v;
+ }
+ });
+
+ System.err.println(
+ String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f",
+ lookup.getClass().getSimpleName(),
+ input.size(),
+ result.average.toString(),
+ input.size() / result.average.avg));
+ }
+ }
+
+ /**
+ * Do the measurements.
+ */
+ private BenchmarkResult measure(Callable callable) {
+ final double NANOS_PER_MS = 1000000;
+
+ try {
+ List<Double> times = new ArrayList<Double>();
+ for (int i = 0; i < warmup + rounds; i++) {
+ final long start = System.nanoTime();
+ guard = callable.call().intValue();
+ times.add((System.nanoTime() - start) / NANOS_PER_MS);
+ }
+ return new BenchmarkResult(times, warmup, rounds);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /** Guard against opts. */
+ @SuppressWarnings("unused")
+ private static volatile int guard;
+
+ private static class BenchmarkResult {
+ /** Average time per round (ms). */
+ public final Average average;
+
+ public BenchmarkResult(List<Double> times, int warmup, int rounds) {
+ this.average = Average.from(times.subList(warmup, times.size()));
+ }
+ }
+}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java 2011-05-22 18:50:16.000000000 -0400
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.suggest;
+
+import java.io.File;
+
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.fst.FSTLookup;
+import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
+import org.apache.lucene.search.suggest.tst.TSTLookup;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class PersistenceTest extends LuceneTestCase {
+ public final String[] keys = new String[] {
+ "one",
+ "two",
+ "three",
+ "four",
+ "oneness",
+ "onerous",
+ "onesimus",
+ "twofold",
+ "twonk",
+ "thrive",
+ "through",
+ "threat",
+ "foundation",
+ "fourier",
+ "fourty"};
+
+ public void testTSTPersistence() throws Exception {
+ runTest(TSTLookup.class, true);
+ }
+
+ public void testJaspellPersistence() throws Exception {
+ runTest(JaspellLookup.class, true);
+ }
+
+ public void testFSTPersistence() throws Exception {
+ runTest(FSTLookup.class, false);
+ }
+
+ private void runTest(Class<? extends Lookup> lookupClass,
+ boolean supportsExactWeights) throws Exception {
+
+ // Add all input keys.
+ Lookup lookup = lookupClass.newInstance();
+ TermFreq[] keys = new TermFreq[this.keys.length];
+ for (int i = 0; i < keys.length; i++)
+ keys[i] = new TermFreq(this.keys[i], (float) i);
+ lookup.build(new TermFreqArrayIterator(keys));
+
+ // Store the suggester.
+ File storeDir = TEMP_DIR;
+ lookup.store(storeDir);
+
+ // Re-read it from disk.
+ lookup = lookupClass.newInstance();
+ lookup.load(storeDir);
+
+ // Assert validity.
+ float previous = Float.NEGATIVE_INFINITY;
+ for (TermFreq k : keys) {
+ Float val = (Float) lookup.get(k.term);
+ assertNotNull(k.term, val);
+
+ if (supportsExactWeights) {
+ assertEquals(k.term, Float.valueOf(k.v), val);
+ } else {
+ assertTrue(val + ">=" + previous, val >= previous);
+ previous = val.floatValue();
+ }
+ }
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java 2011-05-22 18:52:02.000000000 -0400
@@ -0,0 +1,28 @@
+package org.apache.lucene.search.suggest;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public final class TermFreq {
+ public final String term;
+ public final float v;
+
+ public TermFreq(String term, float v) {
+ this.term = term;
+ this.v = v;
+ }
+}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java 2011-05-22 18:52:09.000000000 -0400
@@ -0,0 +1,57 @@
+package org.apache.lucene.search.suggest;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.Iterator;
+
+import org.apache.lucene.search.spell.TermFreqIterator;
+
+/**
+ * A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
+ */
+public final class TermFreqArrayIterator implements TermFreqIterator {
+ private final Iterator<TermFreq> i;
+ private TermFreq current;
+
+ public TermFreqArrayIterator(Iterator<TermFreq> i) {
+ this.i = i;
+ }
+
+ public TermFreqArrayIterator(TermFreq [] i) {
+ this(Arrays.asList(i));
+ }
+
+ public TermFreqArrayIterator(Iterable<TermFreq> i) {
+ this(i.iterator());
+ }
+
+ public float freq() {
+ return current.v;
+ }
+
+ public boolean hasNext() {
+ return i.hasNext();
+ }
+
+ public String next() {
+ return (current = i.next()).term;
+ }
+
+ public void remove() { throw new UnsupportedOperationException(); }
+}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java
--- lucene-clean-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java 2011-05-22 18:52:18.000000000 -0400
@@ -0,0 +1,162 @@
+package org.apache.lucene.search.suggest.fst;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+
+import org.apache.lucene.search.suggest.Lookup.LookupResult;
+import org.apache.lucene.search.suggest.fst.FSTLookup;
+import org.apache.lucene.util.LuceneTestCase;
+
+import org.apache.lucene.search.suggest.LookupBenchmarkTest;
+import org.apache.lucene.search.suggest.TermFreq;
+import org.apache.lucene.search.suggest.TermFreqArrayIterator;
+
+/**
+ * Unit tests for {@link FSTLookup}.
+ */
+public class FSTLookupTest extends LuceneTestCase {
+ public static TermFreq tf(String t, float v) {
+ return new TermFreq(t, v);
+ }
+
+ private FSTLookup lookup;
+
+ public void setUp() throws Exception {
+ super.setUp();
+ final TermFreq[] keys = new TermFreq[] {
+ tf("one", 0.5f),
+ tf("oneness", 1),
+ tf("onerous", 1),
+ tf("onesimus", 1),
+ tf("two", 1),
+ tf("twofold", 1),
+ tf("twonk", 1),
+ tf("thrive", 1),
+ tf("through", 1),
+ tf("threat", 1),
+ tf("three", 1),
+ tf("foundation", 1),
+ tf("fourier", 1),
+ tf("four", 1),
+ tf("fourty", 1),
+ tf("xo", 1),
+ };
+
+ lookup = new FSTLookup();
+ lookup.build(new TermFreqArrayIterator(keys));
+ }
+
+ public void testExactMatchHighPriority() throws Exception {
+ assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
+ }
+
+ public void testExactMatchLowPriority() throws Exception {
+ assertMatchEquals(lookup.lookup("one", true, 2),
+ "one/0.0",
+ "oneness/1.0");
+ }
+
+ public void testMiss() throws Exception {
+ assertMatchEquals(lookup.lookup("xyz", true, 1));
+ }
+
+ public void testAlphabeticWithWeights() throws Exception {
+ assertEquals(0, lookup.lookup("xyz", false, 1).size());
+ }
+
+ public void testFullMatchList() throws Exception {
+ assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
+ "oneness/1.0",
+ "onerous/1.0",
+ "onesimus/1.0",
+ "one/0.0");
+ }
+
+ public void testMultilingualInput() throws Exception {
+ List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
+
+ lookup = new FSTLookup();
+ lookup.build(new TermFreqArrayIterator(input));
+
+ for (TermFreq tf : input) {
+ assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
+ assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
+ }
+ }
+
+ public void testEmptyInput() throws Exception {
+ lookup = new FSTLookup();
+ lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
+
+ assertMatchEquals(lookup.lookup("", true, 10));
+ }
+
+ public void testRandom() throws Exception {
+ List<TermFreq> freqs = new ArrayList<TermFreq>();
+ Random rnd = random;
+ for (int i = 0; i < 5000; i++) {
+ freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
+ }
+ lookup = new FSTLookup();
+ lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
+
+ for (TermFreq tf : freqs) {
+ final String term = tf.term;
+ for (int i = 1; i < term.length(); i++) {
+ String prefix = term.substring(0, i);
+ for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
+ assertTrue(lr.key.startsWith(prefix));
+ }
+ }
+ }
+ }
+
+ private void assertMatchEquals(List<LookupResult> res, String... expected) {
+ String [] result = new String [res.size()];
+ for (int i = 0; i < res.size(); i++)
+ result[i] = res.get(i).toString();
+
+ if (!Arrays.equals(expected, result)) {
+ int colLen = Math.max(maxLen(expected), maxLen(result));
+
+ StringBuilder b = new StringBuilder();
+ String format = "%" + colLen + "s " + "%" + colLen + "s\n";
+ b.append(String.format(Locale.ENGLISH, format, "Expected", "Result"));
+ for (int i = 0; i < Math.max(result.length, expected.length); i++) {
+ b.append(String.format(Locale.ENGLISH, format,
+ i < expected.length ? expected[i] : "--",
+ i < result.length ? result[i] : "--"));
+ }
+
+ System.err.println(b.toString());
+ fail("Expected different output:\n" + b.toString());
+ }
+ }
+
+ private int maxLen(String[] result) {
+ int len = 0;
+ for (String s : result)
+ len = Math.max(len, s.length());
+ return len;
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/common-build.xml lucene-trunk/solr/common-build.xml
--- lucene-clean-trunk/solr/common-build.xml 2011-05-22 12:37:58.000000000 -0400
+++ lucene-trunk/solr/common-build.xml 2011-05-22 18:56:05.000000000 -0400
@@ -188,12 +188,12 @@
+
-
@@ -204,12 +204,12 @@
+
-
@@ -226,6 +226,9 @@
+
+
+
@@ -241,9 +244,6 @@
-
-
-
@@ -252,12 +252,12 @@
+
-
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java lucene-trunk/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java 2011-05-22 17:34:19.000000000 -0400
@@ -26,12 +26,12 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.search.spell.HighFrequencyDictionary;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
-import org.apache.solr.util.HighFrequencyDictionary;
import org.apache.solr.search.SolrIndexSearcher;
/**
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java lucene-trunk/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java 2011-05-22 17:34:45.000000000 -0400
@@ -18,10 +18,11 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.search.spell.HighFrequencyDictionary;
+
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
-import org.apache.solr.util.HighFrequencyDictionary;
import java.io.File;
import java.io.IOException;
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/BufferingTermFreqIteratorWrapper.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/BufferingTermFreqIteratorWrapper.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/BufferingTermFreqIteratorWrapper.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/BufferingTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,65 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.solr.util.TermFreqIterator;
-
-/**
- * This wrapper buffers incoming elements.
- */
-public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
-
- /** Entry in the buffer. */
- public static final class Entry implements Comparable<Entry> {
- String word;
- float freq;
-
- public Entry(String word, float freq) {
- this.word = word;
- this.freq = freq;
- }
-
- public int compareTo(Entry o) {
- return word.compareTo(o.word);
- }
- }
-
- protected ArrayList<Entry> entries = new ArrayList<Entry>();
-
- protected int curPos;
- protected Entry curEntry;
-
- public BufferingTermFreqIteratorWrapper(TermFreqIterator source) {
- // read all source data into buffer
- while (source.hasNext()) {
- String w = source.next();
- Entry e = new Entry(w, source.freq());
- entries.add(e);
- }
- curPos = 0;
- }
-
- public float freq() {
- return curEntry.freq;
- }
-
- public boolean hasNext() {
- return curPos < entries.size();
- }
-
- public String next() {
- curEntry = entries.get(curPos);
- curPos++;
- return curEntry.word;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("remove is not supported");
- }
-
- public List<Entry> entries() {
- return entries;
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/FileDictionary.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/FileDictionary.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/FileDictionary.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/FileDictionary.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,95 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import java.io.*;
-
-import org.apache.lucene.search.spell.Dictionary;
-import org.apache.solr.util.TermFreqIterator;
-
-
-/**
- * Dictionary represented by a text file.
- *
- * Format allowed: 1 string per line, optionally with a tab-separated integer value:
- * word1 TAB 100
- * word2 word3 TAB 101
- * word4 word5 TAB 102
- */
-public class FileDictionary implements Dictionary {
-
- private BufferedReader in;
- private String line;
- private boolean hasNextCalled;
-
- public FileDictionary(InputStream dictFile) {
- in = new BufferedReader(new InputStreamReader(dictFile));
- }
-
- /**
- * Creates a dictionary based on a reader.
- */
- public FileDictionary(Reader reader) {
- in = new BufferedReader(reader);
- }
-
- public TermFreqIterator getWordsIterator() {
- return new fileIterator();
- }
-
- final class fileIterator implements TermFreqIterator {
- private float curFreq;
-
- public String next() {
- if (!hasNextCalled) {
- hasNext();
- }
- hasNextCalled = false;
- return line;
- }
-
- public float freq() {
- return curFreq;
- }
-
- public boolean hasNext() {
- hasNextCalled = true;
- try {
- line = in.readLine();
- if (line != null) {
- String[] fields = line.split("\t");
- if (fields.length > 1) {
- curFreq = Float.parseFloat(fields[1]);
- line = fields[0];
- } else {
- curFreq = 1;
- }
- }
- } catch (IOException ex) {
- throw new RuntimeException(ex);
- }
- return (line != null) ? true : false;
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/Lookup.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/Lookup.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/Lookup.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/Lookup.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,122 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.lucene.search.spell.Dictionary;
-import org.apache.lucene.util.PriorityQueue;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.util.TermFreqIterator;
-
-public abstract class Lookup {
- /**
- * Result of a lookup.
- */
- public static final class LookupResult implements Comparable<LookupResult> {
- public final String key;
- public final float value;
-
- public LookupResult(String key, float value) {
- this.key = key;
- this.value = value;
- }
-
- @Override
- public String toString() {
- return key + "/" + value;
- }
-
- /** Compare alphabetically. */
- public int compareTo(LookupResult o) {
- return this.key.compareTo(o.key);
- }
- }
-
- public static final class LookupPriorityQueue extends PriorityQueue<LookupResult> {
-
- public LookupPriorityQueue(int size) {
- super(size);
- }
-
- @Override
- protected boolean lessThan(LookupResult a, LookupResult b) {
- return a.value < b.value;
- }
-
- public LookupResult[] getResults() {
- int size = size();
- LookupResult[] res = new LookupResult[size];
- for (int i = size - 1; i >= 0; i--) {
- res[i] = pop();
- }
- return res;
- }
- }
-
- /** Initialize the lookup. */
- public abstract void init(NamedList config, SolrCore core);
-
- /** Build lookup from a dictionary. Some implementations may require sorted
- * or unsorted keys from the dictionary's iterator - use
- * {@link SortedTermFreqIteratorWrapper} or
- * {@link UnsortedTermFreqIteratorWrapper} in such case.
- */
- public void build(Dictionary dict) throws IOException {
- Iterator it = dict.getWordsIterator();
- TermFreqIterator tfit;
- if (it instanceof TermFreqIterator) {
- tfit = (TermFreqIterator)it;
- } else {
- tfit = new TermFreqIterator.TermFreqIteratorWrapper(it);
- }
- build(tfit);
- }
-
- protected abstract void build(TermFreqIterator tfit) throws IOException;
-
- /**
- * Persist the constructed lookup data to a directory. Optional operation.
- * @param storeDir directory where data can be stored.
- * @return true if successful, false if unsuccessful or not supported.
- * @throws IOException when fatal IO error occurs.
- */
- public abstract boolean store(File storeDir) throws IOException;
-
- /**
- * Discard current lookup data and load it from a previously saved copy.
- * Optional operation.
- * @param storeDir directory where lookup data was stored.
- * @return true if completed successfully, false if unsuccessful or not supported.
- * @throws IOException when fatal IO error occurs.
- */
- public abstract boolean load(File storeDir) throws IOException;
-
- /**
- * Look up a key and return possible completion for this key.
- * @param key lookup key. Depending on the implementation this may be
- * a prefix, misspelling, or even infix.
- * @param onlyMorePopular return only more popular results
- * @param num maximum number of results to return
- * @return a list of possible completions, with their relative weight (e.g. popularity)
- */
- public abstract List<LookupResult> lookup(String key, boolean onlyMorePopular, int num);
-
- /**
- * Modify the lookup data by recording additional data. Optional operation.
- * @param key new lookup key
- * @param value value to associate with this key
- * @return true if new key is added, false if it already exists or operation
- * is not supported.
- */
- public abstract boolean add(String key, Object value);
-
- /**
- * Get value associated with a specific key.
- * @param key lookup key
- * @return associated value
- */
- public abstract Object get(String key);
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/LookupFactory.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/LookupFactory.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/LookupFactory.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/LookupFactory.java 2011-05-22 17:53:03.000000000 -0400
@@ -0,0 +1,29 @@
+package org.apache.solr.spelling.suggest;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+
+/**
+ * Suggester factory for creating {@link Lookup} instances.
+ */
+public abstract class LookupFactory {
+ public abstract Lookup create(NamedList params, SolrCore core);
+}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/SortedTermFreqIteratorWrapper.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/SortedTermFreqIteratorWrapper.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/SortedTermFreqIteratorWrapper.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/SortedTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,18 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-import java.util.Collections;
-
-import org.apache.solr.util.SortedIterator;
-import org.apache.solr.util.TermFreqIterator;
-
-/**
- * This wrapper buffers incoming elements and makes sure they are sorted in
- * ascending lexicographic order.
- */
-public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator {
-
- public SortedTermFreqIteratorWrapper(TermFreqIterator source) {
- super(source);
- Collections.sort(entries);
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/Suggester.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/Suggester.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/Suggester.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/Suggester.java 2011-05-22 19:16:54.000000000 -0400
@@ -27,15 +27,20 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.HighFrequencyDictionary;
+import org.apache.lucene.search.suggest.FileDictionary;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.Lookup.LookupResult;
+
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.spelling.SolrSpellChecker;
import org.apache.solr.spelling.SpellingOptions;
import org.apache.solr.spelling.SpellingResult;
-import org.apache.solr.spelling.suggest.Lookup.LookupResult;
-import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
-import org.apache.solr.util.HighFrequencyDictionary;
+import org.apache.solr.spelling.suggest.fst.FSTLookupFactory;
+import org.apache.solr.spelling.suggest.jaspell.JaspellLookupFactory;
+import org.apache.solr.spelling.suggest.tst.TSTLookupFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -80,11 +85,18 @@
sourceLocation = (String) config.get(LOCATION);
field = (String)config.get(FIELD);
lookupImpl = (String)config.get(LOOKUP_IMPL);
- if (lookupImpl == null) {
- lookupImpl = JaspellLookup.class.getName();
+
+ // support the old classnames without -Factory for config file backwards compatibility.
+ if (lookupImpl == null || "org.apache.solr.spelling.suggest.jaspell.JaspellLookup".equals(lookupImpl)) {
+ lookupImpl = JaspellLookupFactory.class.getName();
+ } else if ("org.apache.solr.spelling.suggest.tst.TSTLookup".equals(lookupImpl)) {
+ lookupImpl = TSTLookupFactory.class.getName();
+ } else if ("org.apache.solr.spelling.suggest.fst.FSTLookup".equals(lookupImpl)) {
+ lookupImpl = FSTLookupFactory.class.getName();
}
- lookup = (Lookup) core.getResourceLoader().newInstance(lookupImpl);
- lookup.init(config, core);
+
+ LookupFactory factory = (LookupFactory) core.getResourceLoader().newInstance(lookupImpl);
+ lookup = factory.create(config, core);
String store = (String)config.get(STORE_DIR);
if (store != null) {
storeDir = new File(store);
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/UnsortedTermFreqIteratorWrapper.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/UnsortedTermFreqIteratorWrapper.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/UnsortedTermFreqIteratorWrapper.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/UnsortedTermFreqIteratorWrapper.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,17 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-import java.util.Collections;
-
-import org.apache.solr.util.TermFreqIterator;
-
-/**
- * This wrapper buffers the incoming elements and makes sure they are in
- * random order.
- */
-public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
-
- public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) {
- super(source);
- Collections.shuffle(entries);
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookup.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookup.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookup.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookup.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,556 +0,0 @@
-package org.apache.solr.spelling.suggest.fst;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.automaton.fst.Builder;
-import org.apache.lucene.util.automaton.fst.FST;
-import org.apache.lucene.util.automaton.fst.FST.Arc;
-import org.apache.lucene.util.automaton.fst.NoOutputs;
-import org.apache.lucene.util.automaton.fst.Outputs;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.spelling.suggest.Lookup;
-import org.apache.solr.spelling.suggest.tst.TSTLookup;
-import org.apache.solr.util.TermFreqIterator;
-
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-
-/**
- * Finite state automata based implementation of {@link Lookup} query
- * suggestion/ autocomplete interface.
- *
- *
Implementation details
- *
- *
The construction step in {@link #build(TermFreqIterator)} works as follows:
- *
- *
A set of input terms (String) and weights (float) is given.
- *
The range of weights is determined and then all weights are discretized into a fixed set
- * of values ({@link #buckets}).
- * Note that this means that minor changes in weights may be lost during automaton construction.
- * In general, this is not a big problem because the "priorities" of completions can be split
- * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
- * If you need exact, fine-grained weights, use {@link TSTLookup} instead.
- *
All terms in the input are preprended with a synthetic pseudo-character being the weight
- * of that term. For example a term abc with a discretized weight equal '1' would
- * become 1abc.
- *
The terms are sorted by their raw value of utf16 character values (including the synthetic
- * term in front).
- *
A finite state automaton ({@link FST}) is constructed from the input. The root node has
- * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.
- *
- *
- *
At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
- *
- *
For each possible term weight encoded in the automaton (cached arcs from the root above),
- * starting with the highest one, we descend along the path of the input key. If the key is not
- * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
- * No completions.
- *
Otherwise, we have found an internal automaton node that ends the key. The entire
- * subautomaton (all paths) starting from this node form the key's completions. We start
- * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
- * suggestion to the list of results (the weight of this suggestion is constant and equal to the
- * root path we started from). The tricky part is that because automaton edges are sorted and
- * we scan depth-first, we can terminate the entire procedure as soon as we collect enough
- * suggestions the user requested.
- *
In case the number of suggestions collected in the step above is still insufficient,
- * we proceed to the next (smaller) weight leaving the root node and repeat the same
- * algorithm again.
- *
- *
- *
- *
Runtime behavior and performance characteristic
- *
- *
The algorithm described above is optimized for finding suggestions to short prefixes
- * in a top-weights-first order. This is probably the most common use case: it allows
- * presenting suggestions early and sorts them by the global frequency (and then alphabetically).
- *
- *
If there is an exact match in the automaton, it is returned first on the results
- * list (even with by-weight sorting).
- *
- *
Note that the maximum lookup time for any prefix
- * is the time of descending to the subtree, plus traversal of the subtree up to the number
- * of requested suggestions (because they are already presorted by weight on the root level
- * and alphabetically at any node level).
- *
- *
To order alphabetically only (no ordering by priorities), use identical term weights
- * for all terms. Alphabetical suggestions are returned even if non-constant weights are
- * used, but the algorithm for doing this is suboptimal.
- *
- *
"alphabetically" in any of the documentation above indicates utf16 codepoint order,
- * nothing else.
- */
-public class FSTLookup extends Lookup {
- /** A structure for a single entry (for sorting/ preprocessing). */
- private static class Entry {
- char [] term;
- float weight;
-
- public Entry(char [] term, float freq) {
- this.term = term;
- this.weight = freq;
- }
- }
-
- /**
- * The number of separate buckets for weights (discretization). The more buckets,
- * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
- * will not decrease for prefixes which have highly-weighted completions (because these
- * are filled-in first), but will decrease significantly for low-weighted terms (but
- * these should be infrequent, so it is all right).
- *
- *
The number of buckets must be within [1, 255] range.
- */
- public static final String WEIGHT_BUCKETS = "weightBuckets";
-
- /**
- * If true, exact suggestions are returned first, even if they are prefixes
- * of other strings in the automaton (possibly with larger weights).
- */
- public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
-
- /** Serialized automaton file name (storage). */
- public static final String FILENAME = "fst.dat";
-
- /** An empty result. */
- private static final List EMPTY_RESULT = Lists.newArrayList();
-
- /**
- * @see #WEIGHT_BUCKETS
- */
- private int buckets = 10;
-
- /**
- * #see #EXACT_MATCH_FIRST
- */
- private boolean exactMatchFirst = true;
-
- /**
- * Finite state automaton encoding all the lookup terms. See class
- * notes for details.
- */
- private FST automaton;
-
- /**
- * An array of arcs leaving the root automaton state and encoding weights of all
- * completions in their sub-trees.
- */
- private Arc [] rootArcs;
-
- /* */
- @Override
- @SuppressWarnings("rawtypes")
- public void init(NamedList config, SolrCore core) {
- this.buckets = config.get(WEIGHT_BUCKETS) != null
- ? Integer.parseInt(config.get(WEIGHT_BUCKETS).toString())
- : 10;
-
- this.exactMatchFirst = config.get(EXACT_MATCH_FIRST) != null
- ? Boolean.valueOf(config.get(EXACT_MATCH_FIRST).toString())
- : true;
- }
-
- /* */
- @Override
- public void build(TermFreqIterator tfit) throws IOException {
- // Buffer the input because we will need it twice: for calculating
- // weights distribution and for the actual automata building.
- List entries = Lists.newArrayList();
- while (tfit.hasNext()) {
- String term = tfit.next();
- char [] termChars = new char [term.length() + 1]; // add padding for weight.
- for (int i = 0; i < term.length(); i++)
- termChars[i + 1] = term.charAt(i);
- entries.add(new Entry(termChars, tfit.freq()));
- }
-
- // Distribute weights into at most N buckets. This is a form of discretization to
- // limit the number of possible weights so that they can be efficiently encoded in the
- // automaton.
- //
- // It is assumed the distribution of weights is _linear_ so proportional division
- // of [min, max] range will be enough here. Other approaches could be to sort
- // weights and divide into proportional ranges.
- if (entries.size() > 0) {
- redistributeWeightsProportionalMinMax(entries, buckets);
- encodeWeightPrefix(entries);
- }
-
- // Build the automaton (includes input sorting) and cache root arcs in order from the highest,
- // to the lowest weight.
- this.automaton = buildAutomaton(entries);
- cacheRootArcs();
- }
-
- /**
- * Cache the root node's output arcs starting with completions with the highest weights.
- */
- @SuppressWarnings("unchecked")
- private void cacheRootArcs() throws IOException {
- if (automaton != null) {
- List> rootArcs = Lists.newArrayList();
- Arc arc = automaton.getFirstArc(new Arc());
- automaton.readFirstTargetArc(arc, arc);
- while (true) {
- rootArcs.add(new Arc().copyFrom(arc));
- if (arc.isLast())
- break;
- automaton.readNextArc(arc);
- }
-
- Collections.reverse(rootArcs); // we want highest weights first.
- this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
- }
- }
-
- /**
- * Not implemented.
- */
- @Override
- public boolean add(String key, Object value) {
- // This implementation does not support ad-hoc additions (all input
- // must be sorted for the builder).
- return false;
- }
-
- /**
- * Get the (approximated) weight of a single key (if there is a perfect match
- * for it in the automaton).
- *
- * @return Returns the approximated weight of the input key or null
- * if not found.
- */
- @Override
- public Float get(String key) {
- return getExactMatchStartingFromRootArc(0, key);
- }
-
- /**
- * Returns the first exact match by traversing root arcs, starting from
- * the arc i.
- *
- * @param i The first root arc index in {@link #rootArcs} to consider when
- * matching.
- */
- private Float getExactMatchStartingFromRootArc(int i, String key) {
- // Get the UTF-8 bytes representation of the input key.
- try {
- final FST.Arc scratch = new FST.Arc();
- for (; i < rootArcs.length; i++) {
- final FST.Arc rootArc = rootArcs[i];
- final FST.Arc arc = scratch.copyFrom(rootArc);
-
- // Descend into the automaton using the key as prefix.
- if (descendWithPrefix(arc, key)) {
- automaton.readFirstTargetArc(arc, arc);
- if (arc.label == FST.END_LABEL) {
- // Prefix-encoded weight.
- return rootArc.label / (float) buckets;
- }
- }
- }
- } catch (IOException e) {
- // Should never happen, but anyway.
- throw new RuntimeException(e);
- }
-
- return null;
- }
-
- /**
- * Lookup autocomplete suggestions to key.
- *
- * @param key The prefix to which suggestions should be sought.
- * @param onlyMorePopular Return most popular suggestions first. This is the default
- * behavior for this implementation. Setting it to false has no effect (use
- * constant term weights to sort alphabetically only).
- * @param num At most this number of suggestions will be returned.
- * @return Returns the suggestions, sorted by their approximated weight first (decreasing)
- * and then alphabetically (utf16 codepoint order).
- */
- @Override
- public List lookup(String key, boolean onlyMorePopular, int num) {
- if (key.length() == 0 || automaton == null) {
- // Keep the result an ArrayList to keep calls monomorphic.
- return EMPTY_RESULT;
- }
-
- try {
- if (!onlyMorePopular && rootArcs.length > 1) {
- // We could emit a warning here (?). An optimal strategy for alphabetically sorted
- // suggestions would be to add them with a constant weight -- this saves unnecessary
- // traversals and sorting.
- return lookupSortedAlphabetically(key, num);
- } else {
- return lookupSortedByWeight(key, num, true);
- }
- } catch (IOException e) {
- // Should never happen, but anyway.
- throw new RuntimeException(e);
- }
- }
-
- /**
- * Lookup suggestions sorted alphabetically if weights are not constant. This
- * is a workaround: in general, use constant weights for alphabetically sorted result.
- */
- private List lookupSortedAlphabetically(String key, int num) throws IOException {
- // Greedily get num results from each weight branch.
- List res = lookupSortedByWeight(key, num, false);
-
- // Sort and trim.
- Collections.sort(res, new Comparator() {
- @Override
- public int compare(LookupResult o1, LookupResult o2) {
- return o1.key.compareTo(o2.key);
- }
- });
- if (res.size() > num) {
- res = res.subList(0, num);
- }
- return res;
- }
-
- /**
- * Lookup suggestions sorted by weight (descending order).
- *
- * @param greedy If true, the routine terminates immediately when num
- * suggestions have been collected. If false, it will collect suggestions from
- * all weight arcs (needed for {@link #lookupSortedAlphabetically}.
- */
- private ArrayList lookupSortedByWeight(String key, int num, boolean greedy) throws IOException {
- final ArrayList res = new ArrayList(Math.min(10, num));
- final StringBuilder output = new StringBuilder(key);
- final int matchLength = key.length() - 1;
-
- for (int i = 0; i < rootArcs.length; i++) {
- final FST.Arc rootArc = rootArcs[i];
- final FST.Arc arc = new FST.Arc().copyFrom(rootArc);
-
- // Descend into the automaton using the key as prefix.
- if (descendWithPrefix(arc, key)) {
- // Prefix-encoded weight.
- final float weight = rootArc.label / (float) buckets;
-
- // A subgraph starting from the current node has the completions
- // of the key prefix. The arc we're at is the last key's byte,
- // so we will collect it too.
- output.setLength(matchLength);
- if (collect(res, num, weight, output, arc) && greedy) {
- // We have enough suggestion to return immediately. Keep on looking for an
- // exact match, if requested.
- if (exactMatchFirst) {
- Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
- if (exactMatchWeight != null) {
- res.add(0, new LookupResult(key, exactMatchWeight));
- while (res.size() > num) {
- res.remove(res.size() - 1);
- }
- }
- }
- break;
- }
- }
- }
- return res;
- }
-
- /**
- * Descend along the path starting at arc and going through
- * bytes in utf8 argument.
- *
- * @param arc The starting arc. This argument is modified in-place.
- * @param term The term to descend with.
- * @return If true, arc will be set to the arc matching
- * last byte of utf8. false is returned if no such
- * prefix utf8 exists.
- */
- private boolean descendWithPrefix(Arc arc, String term) throws IOException {
- final int max = term.length();
-
- for (int i = 0; i < max; i++) {
- if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
- // No matching prefixes, return an empty result.
- return false;
- }
- }
-
- return true;
- }
-
- /**
- * Recursive collect lookup results from the automaton subgraph starting at arc.
- *
- * @param num Maximum number of results needed (early termination).
- * @param weight Weight of all results found during this collection.
- */
- private boolean collect(List res, int num, float weight, StringBuilder output, Arc arc) throws IOException {
- output.append((char) arc.label);
-
- automaton.readFirstTargetArc(arc, arc);
- while (true) {
- if (arc.label == FST.END_LABEL) {
- res.add(new LookupResult(output.toString(), weight));
- if (res.size() >= num)
- return true;
- } else {
- int save = output.length();
- if (collect(res, num, weight, output, new Arc().copyFrom(arc))) {
- return true;
- }
- output.setLength(save);
- }
-
- if (arc.isLast()) {
- break;
- }
- automaton.readNextArc(arc);
- }
- return false;
- }
-
- /**
- * Builds the final automaton from a list of entries.
- */
- private FST buildAutomaton(List entries) throws IOException {
- if (entries.size() == 0)
- return null;
-
- // Sort by utf16 (raw char value)
- final Comparator comp = new Comparator() {
- public int compare(Entry o1, Entry o2) {
- char [] ch1 = o1.term;
- char [] ch2 = o2.term;
- int len1 = ch1.length;
- int len2 = ch2.length;
-
- int max = Math.min(len1, len2);
- for (int i = 0; i < max; i++) {
- int v = ch1[i] - ch2[i];
- if (v != 0) return v;
- }
- return len1 - len2;
- }
- };
- Collections.sort(entries, comp);
-
- // Avoid duplicated identical entries, if possible. This is required because
- // it breaks automaton construction otherwise.
- int len = entries.size();
- int j = 0;
- for (int i = 1; i < len; i++) {
- if (comp.compare(entries.get(j), entries.get(i)) != 0) {
- entries.set(++j, entries.get(i));
- }
- }
- entries = entries.subList(0, j + 1);
-
- // Build the automaton.
- final Outputs outputs = NoOutputs.getSingleton();
- final Object empty = outputs.getNoOutput();
- final Builder builder =
- new Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
- final IntsRef scratchIntsRef = new IntsRef(10);
- for (Entry e : entries) {
- final int termLength = scratchIntsRef.length = e.term.length;
-
- scratchIntsRef.grow(termLength);
- final int [] ints = scratchIntsRef.ints;
- final char [] chars = e.term;
- for (int i = termLength; --i >= 0;) {
- ints[i] = chars[i];
- }
- builder.add(scratchIntsRef, empty);
- }
- return builder.finish();
- }
-
- /**
- * Prepends the entry's weight to each entry, encoded as a single byte, so that the
- * root automaton node fans out to all possible priorities, starting with the arc that has
- * the highest weights.
- */
- private void encodeWeightPrefix(List entries) {
- for (Entry e : entries) {
- int weight = (int) e.weight;
- assert (weight >= 0 && weight <= buckets) :
- "Weight out of range: " + weight + " [" + buckets + "]";
-
- // There should be a single empty char reserved in front for the weight.
- e.term[0] = (char) weight;
- }
- }
-
- /**
- * Split [min, max] range into buckets, reassigning weights. Entries' weights are
- * remapped to [0, buckets] range (so, buckets + 1 buckets, actually).
- */
- private void redistributeWeightsProportionalMinMax(List entries, int buckets) {
- float min = entries.get(0).weight;
- float max = min;
- for (Entry e : entries) {
- min = Math.min(e.weight, min);
- max = Math.max(e.weight, max);
- }
-
- final float range = max - min;
- for (Entry e : entries) {
- e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
- }
- }
-
- /**
- * Deserialization from disk.
- */
- @Override
- public synchronized boolean load(File storeDir) throws IOException {
- File data = new File(storeDir, FILENAME);
- if (!data.exists() || !data.canRead()) {
- return false;
- }
-
- InputStream is = new BufferedInputStream(new FileInputStream(data));
- try {
- this.automaton = new FST(new InputStreamDataInput(is), NoOutputs.getSingleton());
- cacheRootArcs();
- } finally {
- Closeables.closeQuietly(is);
- }
- return true;
- }
-
- /**
- * Serialization to disk.
- */
- @Override
- public synchronized boolean store(File storeDir) throws IOException {
- if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
- return false;
- }
-
- if (this.automaton == null)
- return false;
-
- File data = new File(storeDir, FILENAME);
- OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
- try {
- this.automaton.save(new OutputStreamDataOutput(os));
- } finally {
- Closeables.closeQuietly(os);
- }
-
- return true;
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java 2011-05-22 18:12:43.000000000 -0400
@@ -0,0 +1,60 @@
+package org.apache.solr.spelling.suggest.fst;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.fst.FSTLookup;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.spelling.suggest.LookupFactory;
+
+/**
+ * Factory for {@link FSTLookup}
+ */
+public class FSTLookupFactory extends LookupFactory {
+
+ /**
+ * The number of separate buckets for weights (discretization). The more buckets,
+ * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
+ * will not decrease for prefixes which have highly-weighted completions (because these
+ * are filled-in first), but will decrease significantly for low-weighted terms (but
+ * these should be infrequent, so it is all right).
+ *
+ *
The number of buckets must be within [1, 255] range.
+ */
+ public static final String WEIGHT_BUCKETS = "weightBuckets";
+
+ /**
+ * If true, exact suggestions are returned first, even if they are prefixes
+ * of other strings in the automaton (possibly with larger weights).
+ */
+ public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
+
+ @Override
+ public Lookup create(NamedList params, SolrCore core) {
+ int buckets = params.get(WEIGHT_BUCKETS) != null
+ ? Integer.parseInt(params.get(WEIGHT_BUCKETS).toString())
+ : 10;
+
+ boolean exactMatchFirst = params.get(EXACT_MATCH_FIRST) != null
+ ? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString())
+ : true;
+
+ return new FSTLookup(buckets, exactMatchFirst);
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/InputStreamDataInput.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/InputStreamDataInput.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/InputStreamDataInput.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/InputStreamDataInput.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,31 +0,0 @@
-package org.apache.solr.spelling.suggest.fst;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import org.apache.lucene.store.DataInput;
-import com.google.common.io.ByteStreams;
-
-/**
- * A {@link DataInput} wrapping a plain {@link InputStream}.
- */
-public class InputStreamDataInput extends DataInput {
-
- private final InputStream is;
-
- public InputStreamDataInput(InputStream is) {
- this.is = is;
- }
-
- @Override
- public byte readByte() throws IOException {
- int v = is.read();
- if (v == -1) throw new EOFException();
- return (byte) v;
- }
-
- @Override
- public void readBytes(byte[] b, int offset, int len) throws IOException {
- ByteStreams.readFully(is, b, offset, len);
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/OutputStreamDataOutput.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/OutputStreamDataOutput.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/OutputStreamDataOutput.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/fst/OutputStreamDataOutput.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,28 +0,0 @@
-package org.apache.solr.spelling.suggest.fst;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-import org.apache.lucene.store.DataOutput;
-
-/**
- * A {@link DataOutput} wrapping a plain {@link OutputStream}.
- */
-public class OutputStreamDataOutput extends DataOutput {
-
- private final OutputStream os;
-
- public OutputStreamDataOutput(OutputStream os) {
- this.os = os;
- }
-
- @Override
- public void writeByte(byte b) throws IOException {
- os.write(b);
- }
-
- @Override
- public void writeBytes(byte[] b, int offset, int length) throws IOException {
- os.write(b, offset, length);
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookup.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookup.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookup.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookup.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,182 +0,0 @@
-package org.apache.solr.spelling.suggest.jaspell;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.spelling.suggest.Lookup;
-import org.apache.solr.spelling.suggest.UnsortedTermFreqIteratorWrapper;
-import org.apache.solr.spelling.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
-import org.apache.solr.util.SortedIterator;
-import org.apache.solr.util.TermFreqIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class JaspellLookup extends Lookup {
- private static final Logger LOG = LoggerFactory.getLogger(JaspellLookup.class);
- JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie();
- private boolean usePrefix = true;
- private int editDistance = 2;
-
- @Override
- public void init(NamedList config, SolrCore core) {
- LOG.info("init: " + config);
- }
-
- @Override
- public void build(TermFreqIterator tfit) throws IOException {
- if (tfit instanceof SortedIterator) {
- // make sure it's unsorted
- tfit = new UnsortedTermFreqIteratorWrapper(tfit);
- }
- trie = new JaspellTernarySearchTrie();
- trie.setMatchAlmostDiff(editDistance);
- while (tfit.hasNext()) {
- String key = tfit.next();
- float freq = tfit.freq();
- if (key.length() == 0) {
- continue;
- }
- trie.put(key, new Float(freq));
- }
- }
-
- @Override
- public boolean add(String key, Object value) {
- trie.put(key, value);
- // XXX
- return false;
- }
-
- @Override
- public Object get(String key) {
- return trie.get(key);
- }
-
- @Override
- public List lookup(String key, boolean onlyMorePopular, int num) {
- List res = new ArrayList();
- List list;
- int count = onlyMorePopular ? num * 2 : num;
- if (usePrefix) {
- list = trie.matchPrefix(key, count);
- } else {
- list = trie.matchAlmost(key, count);
- }
- if (list == null || list.size() == 0) {
- return res;
-
- }
- int maxCnt = Math.min(num, list.size());
- if (onlyMorePopular) {
- LookupPriorityQueue queue = new LookupPriorityQueue(num);
- for (String s : list) {
- float freq = (Float)trie.get(s);
- queue.insertWithOverflow(new LookupResult(s, freq));
- }
- for (LookupResult lr : queue.getResults()) {
- res.add(lr);
- }
- } else {
- for (int i = 0; i < maxCnt; i++) {
- String s = list.get(i);
- float freq = (Float)trie.get(s);
- res.add(new LookupResult(s, freq));
- }
- }
- return res;
- }
-
- public static final String FILENAME = "jaspell.dat";
- private static final byte LO_KID = 0x01;
- private static final byte EQ_KID = 0x02;
- private static final byte HI_KID = 0x04;
- private static final byte HAS_VALUE = 0x08;
-
-
- @Override
- public boolean load(File storeDir) throws IOException {
- File data = new File(storeDir, FILENAME);
- if (!data.exists() || !data.canRead()) {
- return false;
- }
- DataInputStream in = new DataInputStream(new FileInputStream(data));
- TSTNode root = trie.new TSTNode('\0', null);
- try {
- readRecursively(in, root);
- trie.setRoot(root);
- } finally {
- in.close();
- }
- return true;
- }
-
- private void readRecursively(DataInputStream in, TSTNode node) throws IOException {
- node.splitchar = in.readChar();
- byte mask = in.readByte();
- if ((mask & HAS_VALUE) != 0) {
- node.data = new Float(in.readFloat());
- }
- if ((mask & LO_KID) != 0) {
- TSTNode kid = trie.new TSTNode('\0', node);
- node.relatives[TSTNode.LOKID] = kid;
- readRecursively(in, kid);
- }
- if ((mask & EQ_KID) != 0) {
- TSTNode kid = trie.new TSTNode('\0', node);
- node.relatives[TSTNode.EQKID] = kid;
- readRecursively(in, kid);
- }
- if ((mask & HI_KID) != 0) {
- TSTNode kid = trie.new TSTNode('\0', node);
- node.relatives[TSTNode.HIKID] = kid;
- readRecursively(in, kid);
- }
- }
-
- @Override
- public boolean store(File storeDir) throws IOException {
- if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
- return false;
- }
- TSTNode root = trie.getRoot();
- if (root == null) { // empty tree
- return false;
- }
- File data = new File(storeDir, FILENAME);
- DataOutputStream out = new DataOutputStream(new FileOutputStream(data));
- try {
- writeRecursively(out, root);
- out.flush();
- } finally {
- out.close();
- }
- return true;
- }
-
- private void writeRecursively(DataOutputStream out, TSTNode node) throws IOException {
- if (node == null) {
- return;
- }
- out.writeChar(node.splitchar);
- byte mask = 0;
- if (node.relatives[TSTNode.LOKID] != null) mask |= LO_KID;
- if (node.relatives[TSTNode.EQKID] != null) mask |= EQ_KID;
- if (node.relatives[TSTNode.HIKID] != null) mask |= HI_KID;
- if (node.data != null) mask |= HAS_VALUE;
- out.writeByte(mask);
- if (node.data != null) {
- out.writeFloat((Float)node.data);
- }
- writeRecursively(out, node.relatives[TSTNode.LOKID]);
- writeRecursively(out, node.relatives[TSTNode.EQKID]);
- writeRecursively(out, node.relatives[TSTNode.HIKID]);
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookupFactory.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookupFactory.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookupFactory.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellLookupFactory.java 2011-05-22 18:07:01.000000000 -0400
@@ -0,0 +1,39 @@
+package org.apache.solr.spelling.suggest.jaspell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.spelling.suggest.LookupFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Factory for {@link JaspellLookup}
+ */
+public class JaspellLookupFactory extends LookupFactory {
+ private static final Logger LOG = LoggerFactory.getLogger(JaspellLookup.class);
+
+ @Override
+ public Lookup create(NamedList params, SolrCore core) {
+ LOG.info("init: " + params);
+ return new JaspellLookup();
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellTernarySearchTrie.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellTernarySearchTrie.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellTernarySearchTrie.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/jaspell/JaspellTernarySearchTrie.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,866 +0,0 @@
-package org.apache.solr.spelling.suggest.jaspell;
-
-/**
- * Copyright (c) 2005 Bruno Martins
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the organization nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.List;
-import java.util.Vector;
-import java.util.zip.GZIPInputStream;
-
-/**
- * Implementation of a Ternary Search Trie, a data structure for storing
- * String objects that combines the compact size of a binary search
- * tree with the speed of a digital search trie, and is therefore ideal for
- * practical use in sorting and searching data.
- *
- *
- * This data structure is faster than hashing for many typical search problems,
- * and supports a broader range of useful problems and operations. Ternary
- * searches are faster than hashing and more powerful, too.
- *
- *
- *
- * The theory of ternary search trees was described at a symposium in 1997 (see
- * "Fast Algorithms for Sorting and Searching Strings," by J.L. Bentley and R.
- * Sedgewick, Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete
- * Algorithms, January 1997). Algorithms in C, Third Edition, by Robert
- * Sedgewick (Addison-Wesley, 1998) provides yet another view of ternary search
- * trees.
- *
- * @author Bruno Martins
- *
- */
-public class JaspellTernarySearchTrie {
-
- /**
- * An inner class of Ternary Search Trie that represents a node in the trie.
- */
- protected final class TSTNode {
-
- /** Index values for accessing relatives array. */
- protected final static int PARENT = 0, LOKID = 1, EQKID = 2, HIKID = 3;
-
- /** The key to the node. */
- protected Object data;
-
- /** The relative nodes. */
- protected TSTNode[] relatives = new TSTNode[4];
-
- /** The char used in the split. */
- protected char splitchar;
-
- /**
- * Constructor method.
- *
- *@param splitchar
- * The char used in the split.
- *@param parent
- * The parent node.
- */
- protected TSTNode(char splitchar, TSTNode parent) {
- this.splitchar = splitchar;
- relatives[PARENT] = parent;
- }
- }
-
- /**
- * Compares characters by alfabetical order.
- *
- *@param cCompare2
- * The first char in the comparison.
- *@param cRef
- * The second char in the comparison.
- *@return A negative number, 0 or a positive number if the second char is
- * less, equal or greater.
- */
- private static int compareCharsAlphabetically(char cCompare2, char cRef) {
- return Character.toLowerCase(cCompare2) - Character.toLowerCase(cRef);
- }
-
- /* what follows is the original Jaspell code.
- private static int compareCharsAlphabetically(int cCompare2, int cRef) {
- int cCompare = 0;
- if (cCompare2 >= 65) {
- if (cCompare2 < 89) {
- cCompare = (2 * cCompare2) - 65;
- } else if (cCompare2 < 97) {
- cCompare = cCompare2 + 24;
- } else if (cCompare2 < 121) {
- cCompare = (2 * cCompare2) - 128;
- } else cCompare = cCompare2;
- } else cCompare = cCompare2;
- if (cRef < 65) {
- return cCompare - cRef;
- }
- if (cRef < 89) {
- return cCompare - ((2 * cRef) - 65);
- }
- if (cRef < 97) {
- return cCompare - (cRef + 24);
- }
- if (cRef < 121) {
- return cCompare - ((2 * cRef) - 128);
- }
- return cCompare - cRef;
- }
- */
-
- /**
- * The default number of values returned by the matchAlmost
- * method.
- */
- private int defaultNumReturnValues = -1;
-
- /**
- * the number of differences allowed in a call to the
- * matchAlmostKey method.
- */
- private int matchAlmostDiff;
-
- /** The base node in the trie. */
- private TSTNode rootNode;
-
- /**
- * Constructs an empty Ternary Search Trie.
- */
- public JaspellTernarySearchTrie() {
- }
-
- // for loading
- void setRoot(TSTNode newRoot) {
- rootNode = newRoot;
- }
-
- // for saving
- TSTNode getRoot() {
- return rootNode;
- }
-
- /**
- * Constructs a Ternary Search Trie and loads data from a File
- * into the Trie. The file is a normal text document, where each line is of
- * the form word TAB float.
- *
- *@param file
- * The File with the data to load into the Trie.
- *@exception IOException
- * A problem occured while reading the data.
- */
- public JaspellTernarySearchTrie(File file) throws IOException {
- this(file, false);
- }
-
- /**
- * Constructs a Ternary Search Trie and loads data from a File
- * into the Trie. The file is a normal text document, where each line is of
- * the form "word TAB float".
- *
- *@param file
- * The File with the data to load into the Trie.
- *@param compression
- * If true, the file is compressed with the GZIP algorithm, and if
- * false, the file is a normal text document.
- *@exception IOException
- * A problem occured while reading the data.
- */
- public JaspellTernarySearchTrie(File file, boolean compression)
- throws IOException {
- this();
- BufferedReader in;
- if (compression)
- in = new BufferedReader(new InputStreamReader(new GZIPInputStream(
- new FileInputStream(file))));
- else in = new BufferedReader(new InputStreamReader((new FileInputStream(
- file))));
- String word;
- int pos;
- Float occur, one = new Float(1);
- int numWords = 0;
- while ((word = in.readLine()) != null) {
- numWords++;
- pos = word.indexOf("\t");
- occur = one;
- if (pos != -1) {
- occur = Float.parseFloat(word.substring(pos + 1).trim());
- word = word.substring(0, pos);
- }
- String key = word.toLowerCase();
- if (rootNode == null) {
- rootNode = new TSTNode(key.charAt(0), null);
- }
- TSTNode node = null;
- if (key.length() > 0 && rootNode != null) {
- TSTNode currentNode = rootNode;
- int charIndex = 0;
- while (true) {
- if (currentNode == null) break;
- int charComp = compareCharsAlphabetically(key.charAt(charIndex),
- currentNode.splitchar);
- if (charComp == 0) {
- charIndex++;
- if (charIndex == key.length()) {
- node = currentNode;
- break;
- }
- currentNode = currentNode.relatives[TSTNode.EQKID];
- } else if (charComp < 0) {
- currentNode = currentNode.relatives[TSTNode.LOKID];
- } else {
- currentNode = currentNode.relatives[TSTNode.HIKID];
- }
- }
- Float occur2 = null;
- if (node != null) occur2 = ((Float) (node.data));
- if (occur2 != null) {
- occur += occur2.floatValue();
- }
- currentNode = getOrCreateNode(word.trim().toLowerCase());
- currentNode.data = occur;
- }
- }
- in.close();
- }
-
- /**
- * Deletes the node passed in as an argument. If this node has non-null data,
- * then both the node and the data will be deleted. It also deletes any other
- * nodes in the trie that are no longer needed after the deletion of the node.
- *
- *@param nodeToDelete
- * The node to delete.
- */
- private void deleteNode(TSTNode nodeToDelete) {
- if (nodeToDelete == null) {
- return;
- }
- nodeToDelete.data = null;
- while (nodeToDelete != null) {
- nodeToDelete = deleteNodeRecursion(nodeToDelete);
- // deleteNodeRecursion(nodeToDelete);
- }
- }
-
- /**
- * Recursively visits each node to be deleted.
- *
- * To delete a node, first set its data to null, then pass it into this
- * method, then pass the node returned by this method into this method (make
- * sure you don't delete the data of any of the nodes returned from this
- * method!) and continue in this fashion until the node returned by this
- * method is null.
- *
- * The TSTNode instance returned by this method will be next node to be
- * operated on by deleteNodeRecursion (This emulates recursive
- * method call while avoiding the JVM overhead normally associated with a
- * recursive method.)
- *
- *@param currentNode
- * The node to delete.
- *@return The next node to be called in deleteNodeRecursion.
- */
- private TSTNode deleteNodeRecursion(TSTNode currentNode) {
- if (currentNode == null) {
- return null;
- }
- if (currentNode.relatives[TSTNode.EQKID] != null
- || currentNode.data != null) {
- return null;
- }
- // can't delete this node if it has a non-null eq kid or data
- TSTNode currentParent = currentNode.relatives[TSTNode.PARENT];
- boolean lokidNull = currentNode.relatives[TSTNode.LOKID] == null;
- boolean hikidNull = currentNode.relatives[TSTNode.HIKID] == null;
- int childType;
- if (currentParent.relatives[TSTNode.LOKID] == currentNode) {
- childType = TSTNode.LOKID;
- } else if (currentParent.relatives[TSTNode.EQKID] == currentNode) {
- childType = TSTNode.EQKID;
- } else if (currentParent.relatives[TSTNode.HIKID] == currentNode) {
- childType = TSTNode.HIKID;
- } else {
- rootNode = null;
- return null;
- }
- if (lokidNull && hikidNull) {
- currentParent.relatives[childType] = null;
- return currentParent;
- }
- if (lokidNull) {
- currentParent.relatives[childType] = currentNode.relatives[TSTNode.HIKID];
- currentNode.relatives[TSTNode.HIKID].relatives[TSTNode.PARENT] = currentParent;
- return currentParent;
- }
- if (hikidNull) {
- currentParent.relatives[childType] = currentNode.relatives[TSTNode.LOKID];
- currentNode.relatives[TSTNode.LOKID].relatives[TSTNode.PARENT] = currentParent;
- return currentParent;
- }
- int deltaHi = currentNode.relatives[TSTNode.HIKID].splitchar
- - currentNode.splitchar;
- int deltaLo = currentNode.splitchar
- - currentNode.relatives[TSTNode.LOKID].splitchar;
- int movingKid;
- TSTNode targetNode;
- if (deltaHi == deltaLo) {
- if (Math.random() < 0.5) {
- deltaHi++;
- } else {
- deltaLo++;
- }
- }
- if (deltaHi > deltaLo) {
- movingKid = TSTNode.HIKID;
- targetNode = currentNode.relatives[TSTNode.LOKID];
- } else {
- movingKid = TSTNode.LOKID;
- targetNode = currentNode.relatives[TSTNode.HIKID];
- }
- while (targetNode.relatives[movingKid] != null) {
- targetNode = targetNode.relatives[movingKid];
- }
- targetNode.relatives[movingKid] = currentNode.relatives[movingKid];
- currentParent.relatives[childType] = targetNode;
- targetNode.relatives[TSTNode.PARENT] = currentParent;
- if (!lokidNull) {
- currentNode.relatives[TSTNode.LOKID] = null;
- }
- if (!hikidNull) {
- currentNode.relatives[TSTNode.HIKID] = null;
- }
- return currentParent;
- }
-
- /**
- * Retrieve the object indexed by a key.
- *
- *@param key
- * A String index.
- *@return The object retrieved from the Ternary Search Trie.
- */
- public Object get(String key) {
- TSTNode node = getNode(key.trim().toLowerCase());
- if (node == null) {
- return null;
- }
- return node.data;
- }
-
- /**
- * Retrieve the Float indexed by key, increment it by one unit
- * and store the new Float.
- *
- *@param key
- * A String index.
- *@return The Float retrieved from the Ternary Search Trie.
- */
- public Float getAndIncrement(String key) {
- String key2 = key.trim().toLowerCase();
- TSTNode node = getNode(key2);
- if (node == null) {
- return null;
- }
- Float aux = (Float) (node.data);
- if (aux == null) {
- aux = new Float(1);
- } else {
- aux = new Float(aux.intValue() + 1);
- }
- put(key2, aux);
- return aux;
- }
-
- /**
- * Returns the key that indexes the node argument.
- *
- *@param node
- * The node whose index is to be calculated.
- *@return The String that indexes the node argument.
- */
- protected String getKey(TSTNode node) {
- StringBuffer getKeyBuffer = new StringBuffer();
- getKeyBuffer.setLength(0);
- getKeyBuffer.append("" + node.splitchar);
- TSTNode currentNode;
- TSTNode lastNode;
- currentNode = node.relatives[TSTNode.PARENT];
- lastNode = node;
- while (currentNode != null) {
- if (currentNode.relatives[TSTNode.EQKID] == lastNode) {
- getKeyBuffer.append("" + currentNode.splitchar);
- }
- lastNode = currentNode;
- currentNode = currentNode.relatives[TSTNode.PARENT];
- }
- getKeyBuffer.reverse();
- return getKeyBuffer.toString();
- }
-
- /**
- * Returns the node indexed by key, or null if that node doesn't
- * exist. Search begins at root node.
- *
- *@param key
- * A String that indexes the node that is returned.
- *@return The node object indexed by key. This object is an instance of an
- * inner class named TernarySearchTrie.TSTNode.
- */
- public TSTNode getNode(String key) {
- return getNode(key, rootNode);
- }
-
- /**
- * Returns the node indexed by key, or null if that node doesn't
- * exist. The search begins at root node.
- *
- *@param key2
- * A String that indexes the node that is returned.
- *@param startNode
- * The top node defining the subtrie to be searched.
- *@return The node object indexed by key. This object is an instance of an
- * inner class named TernarySearchTrie.TSTNode.
- */
- protected TSTNode getNode(String key2, TSTNode startNode) {
- String key = key2.trim().toLowerCase();
- if (key == null || startNode == null || key.length() == 0) {
- return null;
- }
- TSTNode currentNode = startNode;
- int charIndex = 0;
- while (true) {
- if (currentNode == null) {
- return null;
- }
- int charComp = compareCharsAlphabetically(key.charAt(charIndex),
- currentNode.splitchar);
- if (charComp == 0) {
- charIndex++;
- if (charIndex == key.length()) {
- return currentNode;
- }
- currentNode = currentNode.relatives[TSTNode.EQKID];
- } else if (charComp < 0) {
- currentNode = currentNode.relatives[TSTNode.LOKID];
- } else {
- currentNode = currentNode.relatives[TSTNode.HIKID];
- }
- }
- }
-
- /**
- * Returns the node indexed by key, creating that node if it doesn't exist,
- * and creating any required intermediate nodes if they don't exist.
- *
- *@param key
- * A String that indexes the node that is returned.
- *@return The node object indexed by key. This object is an instance of an
- * inner class named TernarySearchTrie.TSTNode.
- *@exception NullPointerException
- * If the key is null.
- *@exception IllegalArgumentException
- * If the key is an empty String.
- */
- protected TSTNode getOrCreateNode(String key) throws NullPointerException,
- IllegalArgumentException {
- if (key == null) {
- throw new NullPointerException(
- "attempt to get or create node with null key");
- }
- if (key.length() == 0) {
- throw new IllegalArgumentException(
- "attempt to get or create node with key of zero length");
- }
- if (rootNode == null) {
- rootNode = new TSTNode(key.charAt(0), null);
- }
- TSTNode currentNode = rootNode;
- int charIndex = 0;
- while (true) {
- int charComp = compareCharsAlphabetically(key.charAt(charIndex),
- currentNode.splitchar);
- if (charComp == 0) {
- charIndex++;
- if (charIndex == key.length()) {
- return currentNode;
- }
- if (currentNode.relatives[TSTNode.EQKID] == null) {
- currentNode.relatives[TSTNode.EQKID] = new TSTNode(key
- .charAt(charIndex), currentNode);
- }
- currentNode = currentNode.relatives[TSTNode.EQKID];
- } else if (charComp < 0) {
- if (currentNode.relatives[TSTNode.LOKID] == null) {
- currentNode.relatives[TSTNode.LOKID] = new TSTNode(key
- .charAt(charIndex), currentNode);
- }
- currentNode = currentNode.relatives[TSTNode.LOKID];
- } else {
- if (currentNode.relatives[TSTNode.HIKID] == null) {
- currentNode.relatives[TSTNode.HIKID] = new TSTNode(key
- .charAt(charIndex), currentNode);
- }
- currentNode = currentNode.relatives[TSTNode.HIKID];
- }
- }
- }
-
- /**
- * Returns a List of keys that almost match the argument key.
- * Keys returned will have exactly diff characters that do not match the
- * target key, where diff is equal to the last value passed in as an argument
- * to the setMatchAlmostDiff method.
- *
- * If the matchAlmost method is called before the
- * setMatchAlmostDiff method has been called for the first time,
- * then diff = 0.
- *
- *@param key
- * The target key.
- *@return A List with the results.
- */
- public List matchAlmost(String key) {
- return matchAlmost(key, defaultNumReturnValues);
- }
-
- /**
- * Returns a List of keys that almost match the argument key.
- * Keys returned will have exactly diff characters that do not match the
- * target key, where diff is equal to the last value passed in as an argument
- * to the setMatchAlmostDiff method.
- *
- * If the matchAlmost method is called before the
- * setMatchAlmostDiff method has been called for the first time,
- * then diff = 0.
- *
- *@param key
- * The target key.
- *@param numReturnValues
- * The maximum number of values returned by this method.
- *@return A List with the results
- */
- public List matchAlmost(String key, int numReturnValues) {
- return matchAlmostRecursion(rootNode, 0, matchAlmostDiff, key,
- ((numReturnValues < 0) ? -1 : numReturnValues), new Vector(), false);
- }
-
- /**
- * Recursivelly vists the nodes in order to find the ones that almost match a
- * given key.
- *
- *@param currentNode
- * The current node.
- *@param charIndex
- * The current char.
- *@param d
- * The number of differences so far.
- *@param matchAlmostNumReturnValues
- * The maximum number of values in the result List.
- *@param matchAlmostResult2
- * The results so far.
- *@param upTo
- * If true all keys having up to and including matchAlmostDiff
- * mismatched letters will be included in the result (including a key
- * that is exactly the same as the target string) otherwise keys will
- * be included in the result only if they have exactly
- * matchAlmostDiff number of mismatched letters.
- *@param matchAlmostKey
- * The key being searched.
- *@return A List with the results.
- */
- private List matchAlmostRecursion(TSTNode currentNode, int charIndex,
- int d, String matchAlmostKey, int matchAlmostNumReturnValues,
- List matchAlmostResult2, boolean upTo) {
- if ((currentNode == null)
- || (matchAlmostNumReturnValues != -1 && matchAlmostResult2.size() >= matchAlmostNumReturnValues)
- || (d < 0) || (charIndex >= matchAlmostKey.length())) {
- return matchAlmostResult2;
- }
- int charComp = compareCharsAlphabetically(matchAlmostKey.charAt(charIndex),
- currentNode.splitchar);
- List matchAlmostResult = matchAlmostResult2;
- if ((d > 0) || (charComp < 0)) {
- matchAlmostResult = matchAlmostRecursion(
- currentNode.relatives[TSTNode.LOKID], charIndex, d,
- matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult,
- upTo);
- }
- int nextD = (charComp == 0) ? d : d - 1;
- boolean cond = (upTo) ? (nextD >= 0) : (nextD == 0);
- if ((matchAlmostKey.length() == charIndex + 1) && cond
- && (currentNode.data != null)) {
- matchAlmostResult.add(getKey(currentNode));
- }
- matchAlmostResult = matchAlmostRecursion(
- currentNode.relatives[TSTNode.EQKID], charIndex + 1, nextD,
- matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo);
- if ((d > 0) || (charComp > 0)) {
- matchAlmostResult = matchAlmostRecursion(
- currentNode.relatives[TSTNode.HIKID], charIndex, d,
- matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult,
- upTo);
- }
- return matchAlmostResult;
- }
-
- /**
- * Returns an alphabetical List of all keys in the trie that
- * begin with a given prefix. Only keys for nodes having non-null data are
- * included in the List.
- *
- *@param prefix
- * Each key returned from this method will begin with the characters
- * in prefix.
- *@return A List with the results.
- */
- public List matchPrefix(String prefix) {
- return matchPrefix(prefix, defaultNumReturnValues);
- }
-
- /**
- * Returns an alphabetical List of all keys in the trie that
- * begin with a given prefix. Only keys for nodes having non-null data are
- * included in the List.
- *
- *@param prefix
- * Each key returned from this method will begin with the characters
- * in prefix.
- *@param numReturnValues
- * The maximum number of values returned from this method.
- *@return A List with the results
- */
- public List matchPrefix(String prefix, int numReturnValues) {
- Vector sortKeysResult = new Vector();
- TSTNode startNode = getNode(prefix);
- if (startNode == null) {
- return sortKeysResult;
- }
- if (startNode.data != null) {
- sortKeysResult.addElement(getKey(startNode));
- }
- return sortKeysRecursion(startNode.relatives[TSTNode.EQKID],
- ((numReturnValues < 0) ? -1 : numReturnValues), sortKeysResult);
- }
-
- /**
- * Returns the number of nodes in the trie that have non-null data.
- *
- *@return The number of nodes in the trie that have non-null data.
- */
- public int numDataNodes() {
- return numDataNodes(rootNode);
- }
-
- /**
- * Returns the number of nodes in the subtrie below and including the starting
- * node. The method counts only nodes that have non-null data.
- *
- *@param startingNode
- * The top node of the subtrie. the node that defines the subtrie.
- *@return The total number of nodes in the subtrie.
- */
- protected int numDataNodes(TSTNode startingNode) {
- return recursiveNodeCalculator(startingNode, true, 0);
- }
-
- /**
- * Returns the total number of nodes in the trie. The method counts nodes
- * whether or not they have data.
- *
- *@return The total number of nodes in the trie.
- */
- public int numNodes() {
- return numNodes(rootNode);
- }
-
- /**
- * Returns the total number of nodes in the subtrie below and including the
- * starting Node. The method counts nodes whether or not they have data.
- *
- *@param startingNode
- * The top node of the subtrie. The node that defines the subtrie.
- *@return The total number of nodes in the subtrie.
- */
- protected int numNodes(TSTNode startingNode) {
- return recursiveNodeCalculator(startingNode, false, 0);
- }
-
- /**
- * Stores a value in the trie. The value may be retrieved using the key.
- *
- *@param key
- * A String that indexes the object to be stored.
- *@param value
- * The object to be stored in the Trie.
- */
- public void put(String key, Object value) {
- getOrCreateNode(key.trim().toLowerCase()).data = value;
- }
-
- /**
- * Recursivelly visists each node to calculate the number of nodes.
- *
- *@param currentNode
- * The current node.
- *@param checkData
- * If true we check the data to be different of null.
- *@param numNodes2
- * The number of nodes so far.
- *@return The number of nodes accounted.
- */
- private int recursiveNodeCalculator(TSTNode currentNode, boolean checkData,
- int numNodes2) {
- if (currentNode == null) {
- return numNodes2;
- }
- int numNodes = recursiveNodeCalculator(
- currentNode.relatives[TSTNode.LOKID], checkData, numNodes2);
- numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.EQKID],
- checkData, numNodes);
- numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.HIKID],
- checkData, numNodes);
- if (checkData) {
- if (currentNode.data != null) {
- numNodes++;
- }
- } else {
- numNodes++;
- }
- return numNodes;
- }
-
- /**
- * Removes the value indexed by key. Also removes all nodes that are rendered
- * unnecessary by the removal of this data.
- *
- *@param key
- * A string that indexes the object to be removed from
- * the Trie.
- */
- public void remove(String key) {
- deleteNode(getNode(key.trim().toLowerCase()));
- }
-
- /**
- * Sets the number of characters by which words can differ from target word
- * when calling the matchAlmost method.
- *
- * Arguments less than 0 will set the char difference to 0, and arguments
- * greater than 3 will set the char difference to 3.
- *
- *@param diff
- * The number of characters by which words can differ from target
- * word.
- */
- public void setMatchAlmostDiff(int diff) {
- if (diff < 0) {
- matchAlmostDiff = 0;
- } else if (diff > 3) {
- matchAlmostDiff = 3;
- } else {
- matchAlmostDiff = diff;
- }
- }
-
- /**
- * Sets the default maximum number of values returned from the
- * matchPrefix and matchAlmost methods.
- *
- * The value should be set this to -1 to get an unlimited number of return
- * values. note that the methods mentioned above provide overloaded versions
- * that allow you to specify the maximum number of return values, in which
- * case this value is temporarily overridden.
- *
- **@param num
- * The number of values that will be returned when calling the
- * methods above.
- */
- public void setNumReturnValues(int num) {
- defaultNumReturnValues = (num < 0) ? -1 : num;
- }
-
- /**
- * Returns keys sorted in alphabetical order. This includes the start Node and
- * all nodes connected to the start Node.
- *
- * The number of keys returned is limited to numReturnValues. To get a list
- * that isn't limited in size, set numReturnValues to -1.
- *
- *@param startNode
- * The top node defining the subtrie to be searched.
- *@param numReturnValues
- * The maximum number of values returned from this method.
- *@return A List with the results.
- */
- protected List sortKeys(TSTNode startNode, int numReturnValues) {
- return sortKeysRecursion(startNode, ((numReturnValues < 0) ? -1
- : numReturnValues), new Vector());
- }
-
- /**
- * Returns keys sorted in alphabetical order. This includes the current Node
- * and all nodes connected to the current Node.
- *
- * Sorted keys will be appended to the end of the resulting List.
- * The result may be empty when this method is invoked, but may not be
- * null.
- *
- *@param currentNode
- * The current node.
- *@param sortKeysNumReturnValues
- * The maximum number of values in the result.
- *@param sortKeysResult2
- * The results so far.
- *@return A List with the results.
- */
- private List sortKeysRecursion(TSTNode currentNode,
- int sortKeysNumReturnValues, List sortKeysResult2) {
- if (currentNode == null) {
- return sortKeysResult2;
- }
- List sortKeysResult = sortKeysRecursion(
- currentNode.relatives[TSTNode.LOKID], sortKeysNumReturnValues,
- sortKeysResult2);
- if (sortKeysNumReturnValues != -1
- && sortKeysResult.size() >= sortKeysNumReturnValues) {
- return sortKeysResult;
- }
- if (currentNode.data != null) {
- sortKeysResult.add(getKey(currentNode));
- }
- sortKeysResult = sortKeysRecursion(currentNode.relatives[TSTNode.EQKID],
- sortKeysNumReturnValues, sortKeysResult);
- return sortKeysRecursion(currentNode.relatives[TSTNode.HIKID],
- sortKeysNumReturnValues, sortKeysResult);
- }
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTAutocomplete.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTAutocomplete.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTAutocomplete.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTAutocomplete.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,142 +0,0 @@
-package org.apache.solr.spelling.suggest.tst;
-
-import java.util.*;
-
-public class TSTAutocomplete {
-
- /**
- * Inserting keys in TST in the order middle,small,big (lexicographic measure)
- * recursively creates a balanced tree which reduces insertion and search
- * times significantly.
- *
- * @param tokens
- * Sorted list of keys to be inserted in TST.
- * @param lo
- * stores the lower index of current list.
- * @param hi
- * stores the higher index of current list.
- * @param root
- * a reference object to root of TST.
- */
- public void balancedTree(Object[] tokens, Object[] vals, int lo, int hi,
- TernaryTreeNode root) {
- if (lo > hi) return;
- int mid = (lo + hi) / 2;
- root = insert(root, (String) tokens[mid], vals[mid], 0);
- balancedTree(tokens, vals, lo, mid - 1, root);
- balancedTree(tokens, vals, mid + 1, hi, root);
- }
-
- /**
- * Inserts a key in TST creating a series of Binary Search Trees at each node.
- * The key is actually stored across the eqKid of each node in a successive
- * manner.
- *
- * @param currentNode
- * a reference node where the insertion will take currently.
- * @param s
- * key to be inserted in TST.
- * @param x
- * index of character in key to be inserted currently.
- * @return currentNode The new reference to root node of TST
- */
- public TernaryTreeNode insert(TernaryTreeNode currentNode, String s,
- Object val, int x) {
- if (s == null || s.length() <= x) {
- return currentNode;
- }
- if (currentNode == null) {
- TernaryTreeNode newNode = new TernaryTreeNode();
- newNode.splitchar = s.charAt(x);
- currentNode = newNode;
- if (x < s.length() - 1) {
- currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1);
- } else {
- currentNode.token = s;
- currentNode.val = val;
- return currentNode;
- }
- } else if (currentNode.splitchar > s.charAt(x)) {
- currentNode.loKid = insert(currentNode.loKid, s, val, x);
- } else if (currentNode.splitchar == s.charAt(x)) {
- if (x < s.length() - 1) {
- currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1);
- } else {
- currentNode.token = s;
- currentNode.val = val;
- return currentNode;
- }
- } else {
- currentNode.hiKid = insert(currentNode.hiKid, s, val, x);
- }
- return currentNode;
- }
-
- /**
- * Auto-completes a given prefix query using Depth-First Search with the end
- * of prefix as source node each time finding a new leaf to get a complete key
- * to be added in the suggest list.
- *
- * @param root
- * a reference to root node of TST.
- * @param s
- * prefix query to be auto-completed.
- * @param x
- * index of current character to be searched while traversing through
- * the prefix in TST.
- * @return suggest list of auto-completed keys for the given prefix query.
- */
- public ArrayList prefixCompletion(TernaryTreeNode root,
- String s, int x) {
-
- TernaryTreeNode p = root;
- ArrayList suggest = new ArrayList();
-
- while (p != null) {
- if (s.charAt(x) < p.splitchar) {
- p = p.loKid;
- } else if (s.charAt(x) == p.splitchar) {
- if (x == s.length() - 1) {
- break;
- } else {
- x++;
- }
- p = p.eqKid;
- } else {
- p = p.hiKid;
- }
- }
-
- if (p == null) return suggest;
- if (p.eqKid == null && p.token == null) return suggest;
- if (p.eqKid == null && p.token != null) {
- suggest.add(p);
- return suggest;
- }
-
- if (p.token != null) {
- suggest.add(p);
- }
- p = p.eqKid;
-
- Stack st = new Stack();
- st.push(p);
- while (!st.empty()) {
- TernaryTreeNode top = st.peek();
- st.pop();
- if (top.token != null) {
- suggest.add(top);
- }
- if (top.eqKid != null) {
- st.push(top.eqKid);
- }
- if (top.loKid != null) {
- st.push(top.loKid);
- }
- if (top.hiKid != null) {
- st.push(top.hiKid);
- }
- }
- return suggest;
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookup.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookup.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookup.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookup.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,180 +0,0 @@
-package org.apache.solr.spelling.suggest.tst;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.spelling.suggest.Lookup;
-import org.apache.solr.spelling.suggest.SortedTermFreqIteratorWrapper;
-import org.apache.solr.util.SortedIterator;
-import org.apache.solr.util.TermFreqIterator;
-
-public class TSTLookup extends Lookup {
- TernaryTreeNode root = new TernaryTreeNode();
- TSTAutocomplete autocomplete = new TSTAutocomplete();
-
- @Override
- public void init(NamedList config, SolrCore core) {
- }
-
- @Override
- public void build(TermFreqIterator tfit) throws IOException {
- root = new TernaryTreeNode();
- // buffer first
- if (!(tfit instanceof SortedIterator)) {
- // make sure it's sorted
- tfit = new SortedTermFreqIteratorWrapper(tfit);
- }
-
- ArrayList tokens = new ArrayList();
- ArrayList vals = new ArrayList();
- while (tfit.hasNext()) {
- tokens.add(tfit.next());
- vals.add(new Float(tfit.freq()));
- }
- autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
- }
-
- @Override
- public boolean add(String key, Object value) {
- autocomplete.insert(root, key, value, 0);
- // XXX we don't know if a new node was created
- return true;
- }
-
- @Override
- public Object get(String key) {
- List list = autocomplete.prefixCompletion(root, key, 0);
- if (list == null || list.isEmpty()) {
- return null;
- }
- for (TernaryTreeNode n : list) {
- if (n.token.equals(key)) {
- return n.val;
- }
- }
- return null;
- }
-
- @Override
- public List lookup(String key, boolean onlyMorePopular, int num) {
- List list = autocomplete.prefixCompletion(root, key, 0);
- List res = new ArrayList();
- if (list == null || list.size() == 0) {
- return res;
- }
- int maxCnt = Math.min(num, list.size());
- if (onlyMorePopular) {
- LookupPriorityQueue queue = new LookupPriorityQueue(num);
- for (TernaryTreeNode ttn : list) {
- queue.insertWithOverflow(new LookupResult(ttn.token, (Float)ttn.val));
- }
- for (LookupResult lr : queue.getResults()) {
- res.add(lr);
- }
- } else {
- for (int i = 0; i < maxCnt; i++) {
- TernaryTreeNode ttn = list.get(i);
- res.add(new LookupResult(ttn.token, (Float)ttn.val));
- }
- }
- return res;
- }
-
- public static final String FILENAME = "tst.dat";
-
- private static final byte LO_KID = 0x01;
- private static final byte EQ_KID = 0x02;
- private static final byte HI_KID = 0x04;
- private static final byte HAS_TOKEN = 0x08;
- private static final byte HAS_VALUE = 0x10;
-
- @Override
- public synchronized boolean load(File storeDir) throws IOException {
- File data = new File(storeDir, FILENAME);
- if (!data.exists() || !data.canRead()) {
- return false;
- }
- DataInputStream in = new DataInputStream(new FileInputStream(data));
- root = new TernaryTreeNode();
- try {
- readRecursively(in, root);
- } finally {
- in.close();
- }
- return true;
- }
-
- // pre-order traversal
- private void readRecursively(DataInputStream in, TernaryTreeNode node) throws IOException {
- node.splitchar = in.readChar();
- byte mask = in.readByte();
- if ((mask & HAS_TOKEN) != 0) {
- node.token = in.readUTF();
- }
- if ((mask & HAS_VALUE) != 0) {
- node.val = new Float(in.readFloat());
- }
- if ((mask & LO_KID) != 0) {
- node.loKid = new TernaryTreeNode();
- readRecursively(in, node.loKid);
- }
- if ((mask & EQ_KID) != 0) {
- node.eqKid = new TernaryTreeNode();
- readRecursively(in, node.eqKid);
- }
- if ((mask & HI_KID) != 0) {
- node.hiKid = new TernaryTreeNode();
- readRecursively(in, node.hiKid);
- }
- }
-
- @Override
- public synchronized boolean store(File storeDir) throws IOException {
- if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
- return false;
- }
- File data = new File(storeDir, FILENAME);
- DataOutputStream out = new DataOutputStream(new FileOutputStream(data));
- try {
- writeRecursively(out, root);
- out.flush();
- } finally {
- out.close();
- }
- return true;
- }
-
- // pre-order traversal
- private void writeRecursively(DataOutputStream out, TernaryTreeNode node) throws IOException {
- // write out the current node
- out.writeChar(node.splitchar);
- // prepare a mask of kids
- byte mask = 0;
- if (node.eqKid != null) mask |= EQ_KID;
- if (node.loKid != null) mask |= LO_KID;
- if (node.hiKid != null) mask |= HI_KID;
- if (node.token != null) mask |= HAS_TOKEN;
- if (node.val != null) mask |= HAS_VALUE;
- out.writeByte(mask);
- if (node.token != null) out.writeUTF(node.token);
- if (node.val != null) out.writeFloat((Float)node.val);
- // recurse and write kids
- if (node.loKid != null) {
- writeRecursively(out, node.loKid);
- }
- if (node.eqKid != null) {
- writeRecursively(out, node.eqKid);
- }
- if (node.hiKid != null) {
- writeRecursively(out, node.hiKid);
- }
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookupFactory.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookupFactory.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookupFactory.java 1969-12-31 19:00:00.000000000 -0500
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TSTLookupFactory.java 2011-05-22 18:00:18.000000000 -0400
@@ -0,0 +1,35 @@
+package org.apache.solr.spelling.suggest.tst;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.tst.TSTLookup;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.spelling.suggest.LookupFactory;
+
+/**
+ * Factory for {@link TSTLookup}
+ */
+public class TSTLookupFactory extends LookupFactory {
+
+ @Override
+ public Lookup create(NamedList params, SolrCore core) {
+ return new TSTLookup();
+ }
+}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TernaryTreeNode.java lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TernaryTreeNode.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TernaryTreeNode.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/spelling/suggest/tst/TernaryTreeNode.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,25 +0,0 @@
-package org.apache.solr.spelling.suggest.tst;
-
-/**
- * The class creates a TST node.
- */
-
-public class TernaryTreeNode {
- /** the character stored by a node. */
- char splitchar;
- /** a reference object to the node containing character smaller than this node's character. */
- TernaryTreeNode loKid;
- /**
- * a reference object to the node containing character next to this node's character as
- * occurring in the inserted token.
- */
- TernaryTreeNode eqKid;
- /** a reference object to the node containing character higher than this node's character. */
- TernaryTreeNode hiKid;
- /**
- * used by leaf nodes to store the complete tokens to be added to suggest list while
- * auto-completing the prefix.
- */
- String token;
- Object val;
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java lucene-trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,133 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.util;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.search.spell.Dictionary;
-import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.BytesRef;
-
-/**
- * HighFrequencyDictionary: terms taken from the given field
- * of a Lucene index, which appear in a number of documents
- * above a given threshold.
- *
- * Threshold is a value in [0..1] representing the minimum
- * number of documents (of the total) where a term should appear.
- *
- * Based on LuceneDictionary.
- */
-public class HighFrequencyDictionary implements Dictionary {
- private IndexReader reader;
- private String field;
- private float thresh;
-
- public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
- this.reader = reader;
- this.field = StringHelper.intern(field);
- this.thresh = thresh;
- }
-
- public final Iterator getWordsIterator() {
- return new HighFrequencyIterator();
- }
-
- final class HighFrequencyIterator implements TermFreqIterator, SortedIterator {
- private TermsEnum termsEnum;
- private BytesRef actualTerm;
- private boolean hasNextCalled;
- private int minNumDocs;
-
- HighFrequencyIterator() {
- try {
- Terms terms = MultiFields.getTerms(reader, field);
- if (terms != null) {
- termsEnum = terms.iterator();
- }
- minNumDocs = (int)(thresh * (float)reader.numDocs());
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- private boolean isFrequent(int freq) {
- return freq >= minNumDocs;
- }
-
- public float freq() {
- try {
- return termsEnum.docFreq();
- } catch (IOException ioe) {
- throw new RuntimeException(ioe);
- }
- }
-
- public String next() {
- if (!hasNextCalled && !hasNext()) {
- return null;
- }
- hasNextCalled = false;
-
- return (actualTerm != null) ? actualTerm.utf8ToString() : null;
- }
-
- public boolean hasNext() {
- if (hasNextCalled) {
- return actualTerm != null;
- }
- hasNextCalled = true;
-
- if (termsEnum == null) {
- return false;
- }
-
- while(true) {
-
- try {
- actualTerm = termsEnum.next();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- // if there are no words return false
- if (actualTerm == null) {
- return false;
- }
-
- // got a valid term, does it pass the threshold?
- try {
- if (isFrequent(termsEnum.docFreq())) {
- return true;
- }
- } catch (IOException ioe) {
- throw new RuntimeException(ioe);
- }
- }
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/util/SortedIterator.java lucene-trunk/solr/src/java/org/apache/solr/util/SortedIterator.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/util/SortedIterator.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/util/SortedIterator.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,11 +0,0 @@
-package org.apache.solr.util;
-
-import java.util.Iterator;
-
-/**
- * Marker interface to signal that elements coming from {@link Iterator}
- * come in ascending lexicographic order.
- */
-public interface SortedIterator {
-
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/java/org/apache/solr/util/TermFreqIterator.java lucene-trunk/solr/src/java/org/apache/solr/util/TermFreqIterator.java
--- lucene-clean-trunk/solr/src/java/org/apache/solr/util/TermFreqIterator.java 2011-05-22 12:37:52.000000000 -0400
+++ lucene-trunk/solr/src/java/org/apache/solr/util/TermFreqIterator.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,33 +0,0 @@
-package org.apache.solr.util;
-
-import java.util.Iterator;
-
-public interface TermFreqIterator extends Iterator {
-
- public float freq();
-
- public static class TermFreqIteratorWrapper implements TermFreqIterator {
- private Iterator wrapped;
-
- public TermFreqIteratorWrapper(Iterator wrapped) {
- this.wrapped = wrapped;
- }
-
- public float freq() {
- return 1.0f;
- }
-
- public boolean hasNext() {
- return wrapped.hasNext();
- }
-
- public String next() {
- return wrapped.next().toString();
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
-
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/Average.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/Average.java
--- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/Average.java 2011-05-22 12:37:50.000000000 -0400
+++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/Average.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,52 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-import java.util.List;
-import java.util.Locale;
-
-/**
- * Average with standard deviation.
- */
-final class Average
-{
- /**
- * Average (in milliseconds).
- */
- public final double avg;
-
- /**
- * Standard deviation (in milliseconds).
- */
- public final double stddev;
-
- /**
- *
- */
- Average(double avg, double stddev)
- {
- this.avg = avg;
- this.stddev = stddev;
- }
-
- public String toString()
- {
- return String.format(Locale.ENGLISH, "%.0f [+- %.2f]",
- avg, stddev);
- }
-
- static Average from(List values)
- {
- double sum = 0;
- double sumSquares = 0;
-
- for (double l : values)
- {
- sum += l;
- sumSquares += l * l;
- }
-
- double avg = sum / (double) values.size();
- return new Average(
- (sum / (double) values.size()),
- Math.sqrt(sumSquares / (double) values.size() - avg * avg));
- }
-}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/LookupBenchmarkTest.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/LookupBenchmarkTest.java
--- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/LookupBenchmarkTest.java 2011-05-22 12:37:50.000000000 -0400
+++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/LookupBenchmarkTest.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,230 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-import java.net.URL;
-import java.util.Collections;
-import java.util.List;
-import java.util.Locale;
-import java.util.Random;
-import java.util.concurrent.Callable;
-
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.solr.spelling.suggest.fst.FSTLookup;
-import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
-import org.apache.solr.spelling.suggest.tst.TSTLookup;
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Ignore;
-import org.junit.Test;
-
-import com.google.common.base.Charsets;
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.io.Resources;
-
-/**
- * Benchmarks tests for implementations of {@link Lookup} interface.
- */
-@Ignore // COMMENT ME TO RUN BENCHMARKS!
-public class LookupBenchmarkTest {
- @SuppressWarnings("unchecked")
- private final List> benchmarkClasses = Lists.newArrayList(
- JaspellLookup.class,
- TSTLookup.class,
- FSTLookup.class);
-
- private final static int rounds = 15;
- private final static int warmup = 5;
-
- private final int num = 7;
- private final boolean onlyMorePopular = true;
-
- private final static Random random = new Random(0xdeadbeef);
-
- /**
- * Input term/weight pairs.
- */
- private static TermFreq [] dictionaryInput;
-
- /**
- * Benchmark term/weight pairs (randomized order).
- */
- private static List benchmarkInput;
-
- /**
- * Loads terms and frequencies from Wikipedia (cached).
- */
- @BeforeClass
- public static void setup() throws Exception {
- List input = readTop50KWiki();
- Collections.shuffle(input, random);
- LookupBenchmarkTest.dictionaryInput = input.toArray(new TermFreq [input.size()]);
- Collections.shuffle(input, random);
- LookupBenchmarkTest.benchmarkInput = input;
- }
-
- /**
- * Collect the multilingual input for benchmarks/ tests.
- */
- public static List readTop50KWiki() throws Exception {
- List input = Lists.newArrayList();
- URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8");
- assert resource != null : "Resource missing: Top50KWiki.utf8";
-
- for (String line : Resources.readLines(resource, Charsets.UTF_8)) {
- int tab = line.indexOf('|');
- Assert.assertTrue("No | separator?: " + line, tab >= 0);
- float weight = Float.parseFloat(line.substring(tab + 1));
- String key = line.substring(0, tab);
- input.add(new TermFreq(key, weight));
- }
- return input;
- }
-
- /**
- * Test construction time.
- */
- @Test
- public void testConstructionTime() throws Exception {
- System.err.println("-- construction time");
- for (final Class extends Lookup> cls : benchmarkClasses) {
- BenchmarkResult result = measure(new Callable() {
- public Integer call() throws Exception {
- final Lookup lookup = buildLookup(cls, dictionaryInput);
- return lookup.hashCode();
- }
- });
-
- System.err.println(
- String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s",
- cls.getSimpleName(),
- dictionaryInput.length,
- result.average.toString()));
- }
- }
-
- /**
- * Test memory required for the storage.
- */
- @Test
- public void testStorageNeeds() throws Exception {
- System.err.println("-- RAM consumption");
- final RamUsageEstimator rue = new RamUsageEstimator();
- for (Class extends Lookup> cls : benchmarkClasses) {
- Lookup lookup = buildLookup(cls, dictionaryInput);
- System.err.println(
- String.format(Locale.ENGLISH, "%-15s size[B]:%,13d",
- lookup.getClass().getSimpleName(),
- rue.estimateRamUsage(lookup)));
- }
- }
-
- /**
- * Create {@link Lookup} instance and populate it.
- */
- private Lookup buildLookup(Class extends Lookup> cls, TermFreq[] input) throws Exception {
- Lookup lookup = cls.newInstance();
- lookup.build(new TermFreqArrayIterator(input));
- return lookup;
- }
-
- /**
- * Test performance of lookup on full hits.
- */
- @Test
- public void testPerformanceOnFullHits() throws Exception {
- final int minPrefixLen = 100;
- final int maxPrefixLen = 200;
- runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
- }
-
- /**
- * Test performance of lookup on longer term prefixes (6-9 letters or shorter).
- */
- @Test
- public void testPerformanceOnPrefixes6_9() throws Exception {
- final int minPrefixLen = 6;
- final int maxPrefixLen = 9;
- runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
- }
-
- /**
- * Test performance of lookup on short term prefixes (2-4 letters or shorter).
- */
- @Test
- public void testPerformanceOnPrefixes2_4() throws Exception {
- final int minPrefixLen = 2;
- final int maxPrefixLen = 4;
- runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
- }
-
- /**
- * Run the actual benchmark.
- */
- public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen,
- final int num, final boolean onlyMorePopular) throws Exception {
- System.err.println(String.format(Locale.ENGLISH,
- "-- prefixes: %d-%d, num: %d, onlyMorePopular: %s",
- minPrefixLen, maxPrefixLen, num, onlyMorePopular));
-
- for (Class extends Lookup> cls : benchmarkClasses) {
- final Lookup lookup = buildLookup(cls, dictionaryInput);
-
- final List input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function() {
- public String apply(TermFreq tf) {
- return tf.term.substring(0, Math.min(tf.term.length(),
- minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
- }
- }));
-
- BenchmarkResult result = measure(new Callable() {
- public Integer call() throws Exception {
- int v = 0;
- for (String term : input) {
- v += lookup.lookup(term, onlyMorePopular, num).size();
- }
- return v;
- }
- });
-
- System.err.println(
- String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f",
- lookup.getClass().getSimpleName(),
- input.size(),
- result.average.toString(),
- input.size() / result.average.avg));
- }
- }
-
- /**
- * Do the measurements.
- */
- private BenchmarkResult measure(Callable callable) {
- final double NANOS_PER_MS = 1000000;
-
- try {
- List times = Lists.newArrayList();
- for (int i = 0; i < warmup + rounds; i++) {
- final long start = System.nanoTime();
- guard = callable.call().intValue();
- times.add((System.nanoTime() - start) / NANOS_PER_MS);
- }
- return new BenchmarkResult(times, warmup, rounds);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- /** Guard against opts. */
- @SuppressWarnings("unused")
- private static volatile int guard;
-
- private static class BenchmarkResult {
- /** Average time per round (ms). */
- public final Average average;
-
- public BenchmarkResult(List times, int warmup, int rounds) {
- this.average = Average.from(times.subList(warmup, times.size()));
- }
- }
-}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/PersistenceTest.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/PersistenceTest.java
--- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/PersistenceTest.java 2011-05-22 12:37:50.000000000 -0400
+++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/PersistenceTest.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.spelling.suggest;
-
-import java.io.File;
-
-import org.apache.solr.SolrTestCaseJ4;
-import org.apache.solr.spelling.suggest.fst.FSTLookup;
-import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
-import org.apache.solr.spelling.suggest.tst.TSTLookup;
-import org.junit.Test;
-
-public class PersistenceTest extends SolrTestCaseJ4 {
- public final String[] keys = new String[] {
- "one",
- "two",
- "three",
- "four",
- "oneness",
- "onerous",
- "onesimus",
- "twofold",
- "twonk",
- "thrive",
- "through",
- "threat",
- "foundation",
- "fourier",
- "fourty"};
-
- @Test
- public void testTSTPersistence() throws Exception {
- runTest(TSTLookup.class, true);
- }
-
- @Test
- public void testJaspellPersistence() throws Exception {
- runTest(JaspellLookup.class, true);
- }
-
- @Test
- public void testFSTPersistence() throws Exception {
- runTest(FSTLookup.class, false);
- }
-
- private void runTest(Class extends Lookup> lookupClass,
- boolean supportsExactWeights) throws Exception {
-
- // Add all input keys.
- Lookup lookup = lookupClass.newInstance();
- TermFreq[] keys = new TermFreq[this.keys.length];
- for (int i = 0; i < keys.length; i++)
- keys[i] = new TermFreq(this.keys[i], (float) i);
- lookup.build(new TermFreqArrayIterator(keys));
-
- // Store the suggester.
- File storeDir = new File(TEST_HOME());
- lookup.store(storeDir);
-
- // Re-read it from disk.
- lookup = lookupClass.newInstance();
- lookup.load(storeDir);
-
- // Assert validity.
- float previous = Float.NEGATIVE_INFINITY;
- for (TermFreq k : keys) {
- Float val = (Float) lookup.get(k.term);
- assertNotNull(k.term, val);
-
- if (supportsExactWeights) {
- assertEquals(k.term, Float.valueOf(k.v), val);
- } else {
- assertTrue(val + ">=" + previous, val >= previous);
- previous = val.floatValue();
- }
- }
- }
-}
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreq.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreq.java
--- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreq.java 2011-05-22 12:37:50.000000000 -0400
+++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreq.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,11 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-public final class TermFreq {
- public final String term;
- public final float v;
-
- public TermFreq(String term, float v) {
- this.term = term;
- this.v = v;
- }
-}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreqArrayIterator.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreqArrayIterator.java
--- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreqArrayIterator.java 2011-05-22 12:37:50.000000000 -0400
+++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/TermFreqArrayIterator.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,40 +0,0 @@
-package org.apache.solr.spelling.suggest;
-
-import java.util.Arrays;
-import java.util.Iterator;
-
-import org.apache.solr.util.TermFreqIterator;
-
-/**
- * A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
- */
-public final class TermFreqArrayIterator implements TermFreqIterator {
- private final Iterator i;
- private TermFreq current;
-
- public TermFreqArrayIterator(Iterator i) {
- this.i = i;
- }
-
- public TermFreqArrayIterator(TermFreq [] i) {
- this(Arrays.asList(i));
- }
-
- public TermFreqArrayIterator(Iterable i) {
- this(i.iterator());
- }
-
- public float freq() {
- return current.v;
- }
-
- public boolean hasNext() {
- return i.hasNext();
- }
-
- public String next() {
- return (current = i.next()).term;
- }
-
- public void remove() { throw new UnsupportedOperationException(); }
-}
\ No newline at end of file
diff -ruN -x .svn -x build lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/fst/FSTLookupTest.java lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/fst/FSTLookupTest.java
--- lucene-clean-trunk/solr/src/test/org/apache/solr/spelling/suggest/fst/FSTLookupTest.java 2011-05-22 12:37:50.000000000 -0400
+++ lucene-trunk/solr/src/test/org/apache/solr/spelling/suggest/fst/FSTLookupTest.java 1969-12-31 19:00:00.000000000 -0500
@@ -1,155 +0,0 @@
-package org.apache.solr.spelling.suggest.fst;
-
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
-import java.util.Random;
-
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.solr.spelling.suggest.Lookup.LookupResult;
-import org.apache.solr.spelling.suggest.LookupBenchmarkTest;
-import org.apache.solr.spelling.suggest.TermFreq;
-import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-/**
- * Unit tests for {@link FSTLookup}.
- */
-public class FSTLookupTest extends LuceneTestCase {
- public static TermFreq tf(String t, float v) {
- return new TermFreq(t, v);
- }
-
- private FSTLookup lookup;
-
- @Before
- public void prepare() throws Exception {
- final TermFreq[] keys = new TermFreq[] {
- tf("one", 0.5f),
- tf("oneness", 1),
- tf("onerous", 1),
- tf("onesimus", 1),
- tf("two", 1),
- tf("twofold", 1),
- tf("twonk", 1),
- tf("thrive", 1),
- tf("through", 1),
- tf("threat", 1),
- tf("three", 1),
- tf("foundation", 1),
- tf("fourier", 1),
- tf("four", 1),
- tf("fourty", 1),
- tf("xo", 1),
- };
-
- lookup = new FSTLookup();
- lookup.build(new TermFreqArrayIterator(keys));
- }
-
- @Test
- public void testExactMatchHighPriority() throws Exception {
- assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
- }
-
- @Test
- public void testExactMatchLowPriority() throws Exception {
- assertMatchEquals(lookup.lookup("one", true, 2),
- "one/0.0",
- "oneness/1.0");
- }
-
- @Test
- public void testMiss() throws Exception {
- assertMatchEquals(lookup.lookup("xyz", true, 1));
- }
-
- @Test
- public void testAlphabeticWithWeights() throws Exception {
- assertEquals(0, lookup.lookup("xyz", false, 1).size());
- }
-
- @Test
- public void testFullMatchList() throws Exception {
- assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
- "oneness/1.0",
- "onerous/1.0",
- "onesimus/1.0",
- "one/0.0");
- }
-
- @Test
- public void testMultilingualInput() throws Exception {
- List input = LookupBenchmarkTest.readTop50KWiki();
-
- lookup = new FSTLookup();
- lookup.build(new TermFreqArrayIterator(input));
-
- for (TermFreq tf : input) {
- assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
- assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
- }
- }
-
- @Test
- public void testEmptyInput() throws Exception {
- lookup = new FSTLookup();
- lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
-
- assertMatchEquals(lookup.lookup("", true, 10));
- }
-
- @Test
- public void testRandom() throws Exception {
- List freqs = Lists.newArrayList();
- Random rnd = random;
- for (int i = 0; i < 5000; i++) {
- freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
- }
- lookup = new FSTLookup();
- lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
-
- for (TermFreq tf : freqs) {
- final String term = tf.term;
- for (int i = 1; i < term.length(); i++) {
- String prefix = term.substring(0, i);
- for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
- Assert.assertTrue(lr.key.startsWith(prefix));
- }
- }
- }
- }
-
- private void assertMatchEquals(List res, String... expected) {
- String [] result = new String [res.size()];
- for (int i = 0; i < res.size(); i++)
- result[i] = res.get(i).toString();
-
- if (!Arrays.equals(expected, result)) {
- int colLen = Math.max(maxLen(expected), maxLen(result));
-
- StringBuilder b = new StringBuilder();
- String format = "%" + colLen + "s " + "%" + colLen + "s\n";
- b.append(String.format(Locale.ENGLISH, format, "Expected", "Result"));
- for (int i = 0; i < Math.max(result.length, expected.length); i++) {
- b.append(String.format(Locale.ENGLISH, format,
- i < expected.length ? expected[i] : "--",
- i < result.length ? result[i] : "--"));
- }
-
- System.err.println(b.toString());
- fail("Expected different output:\n" + b.toString());
- }
- }
-
- private int maxLen(String[] result) {
- int len = 0;
- for (String s : result)
- len = Math.max(len, s.length());
- return len;
- }
-}