Index: contrib/regex/build.xml =================================================================== --- contrib/regex/build.xml (revision 766522) +++ contrib/regex/build.xml (working copy) @@ -24,7 +24,7 @@ - + = Regents of the University of California + = University of California, Berkeley + = 1998 + +In the original BSD license, both occurrences of the phrase "COPYRIGHT HOLDERS AND CONTRIBUTORS" in the disclaimer read "REGENTS AND CONTRIBUTORS". + +Here is the license template: + +Copyright (c) , +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Index: contrib/regex/lib/automaton.jar =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Property changes on: contrib\regex\lib\automaton.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFuzzyQuery.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFuzzyQuery.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFuzzyQuery.java (revision 0) @@ -0,0 +1,139 @@ +package org.apache.lucene.search.regex; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.BasicAutomata; +import dk.brics.automaton.BasicOperations; + +/** + * A query that efficiently searches for all terms within an edit distance of 1. + * + * All matches of at most one insertion, deletion, substitution, or optionally, transposition are matched. + * Mismatches are ranked lower based upon the edit distance. + * + */ +public class AutomatonFuzzyQuery extends AutomatonQuery { + + /** + * Create a new AutomatonFuzzyQuery matching all terms within an edit distance of Term + * By default, transpositions are matched. 
+ * @param term Term to match + */ + public AutomatonFuzzyQuery(Term term) { + this(term, true); + } + + /** + * Create a new AutomatonFuzzyQuery matching all terms within an edit distance of Term + * @param term Term to match + * @param allow_transposition whether to match transpositions + */ + public AutomatonFuzzyQuery(Term term, boolean allow_transposition) { + super(term, toAutomaton(term.text(), allow_transposition)); + } + + /** + * Return an automaton that accepts all 1-character insertions, deletions, and substitutions of s. + * if allow_transposition is set, will also accept transpositions (swapping two characters) + */ + protected static Automaton toAutomaton(String s, boolean allow_transposition) { + List automata = new ArrayList(); + + automata.add(insertionsOf(s)); + automata.add(deletionsOf(s)); + automata.add(substitutionsOf(s)); + if (allow_transposition) + automata.add(transpositionsOf(s)); + + return BasicOperations.union(automata); + } + + /** + * Return an automaton that accepts all 1-character insertions of s (inserting one character) + */ + protected static Automaton insertionsOf(String s) { + List automata = new ArrayList(); + for (int i = 0; i <= s.length(); i++) { + Automaton a = BasicAutomata.makeString(s.substring(0, i)); + a = BasicOperations.concatenate(a, BasicAutomata.makeAnyChar()); + a = BasicOperations.concatenate(a, BasicAutomata.makeString(s.substring(i))); + automata.add(a); + } + return BasicOperations.union(automata); + } + + /** + * Return an automaton that accepts all 1-character deletions of s (deleting one character) + */ + protected static Automaton deletionsOf(String s) { + List automata = new ArrayList(); + for (int i = 0; i < s.length(); i++) { + Automaton a = BasicAutomata.makeString(s.substring(0, i)); + a = BasicOperations.concatenate(a, BasicAutomata.makeString(s.substring(i + 1))); + automata.add(a); + } + return BasicOperations.union(automata); + } + + /** + * Return an automaton that accepts all 1-character 
substitutions of s (replacing one character) + */ + protected static Automaton substitutionsOf(String s) { + List automata = new ArrayList(); + for (int i = 0; i < s.length(); i++) { + Automaton a = BasicAutomata.makeString(s.substring(0, i)); + a = BasicOperations.concatenate(a, BasicAutomata.makeAnyChar()); + a = BasicOperations.concatenate(a, BasicAutomata.makeString(s.substring(i + 1))); + automata.add(a); + } + return BasicOperations.union(automata); + } + + /** + * Return an automaton that accepts all 1-character transpositions of s (swapping two characters) + */ + protected static Automaton transpositionsOf(String s) { + List automata = new ArrayList(); + for (int i = 1; i < s.length(); i++) { + Automaton a = BasicAutomata.makeString(s.substring(0, i - 1)); + a = BasicOperations.concatenate(a, BasicAutomata.makeChar(s.charAt(i))); + a = BasicOperations.concatenate(a, BasicAutomata.makeChar(s.charAt(i - 1))); + a = BasicOperations.concatenate(a, BasicAutomata.makeString(s.substring(i + 1))); + automata.add(a); + } + return BasicOperations.union(automata); + } + + /** + * Override AutomatonTermEnum to score mismatches. + */ + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new AutomatonFuzzyTermEnum(automaton, field, reader); + } + + /** + * Override difference() method to score exact matches differently than inexact matches. 
+ */ + class AutomatonFuzzyTermEnum extends AutomatonTermEnum { + + AutomatonFuzzyTermEnum(Automaton automaton, String field, IndexReader reader) throws IOException { + super(automaton, field, reader); + } + + public float difference() { + if (currentTerm == null || currentTerm.equals(term)) + return 1.0F; + else + return 1.0F - (1.0F / (Math.max(currentTerm.text().length(), term.text().length()))); + } + + } + +} Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0) @@ -0,0 +1,123 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.search.MultiTermQuery; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.RegExp; + +/** + *

+ * A regular expression query that uses the BRICS automaton package: http://www.brics.dk/automaton/ + *

+ * + *

+ * Queries are converted to a DFA, and the state machine is used to optimize term enumeration. + *

+ * + *

+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *

+ * + */ +public class AutomatonQuery extends MultiTermQuery { + protected Automaton automaton; + protected String field; + + /** + *

+ * Construct a new AutomatonQuery. + *

+ * Term is expected to contain regex syntax compatible with the BRICS package: + * http://www.brics.dk/automaton/ + *

+ *

+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *

+ * @param term Term containing field and regular expression + */ + public AutomatonQuery(Term term) { + this(term, new RegExp(term.text()).toAutomaton()); + } + + /** + *

+ * Construct a new AutomatonQuery with a prebuilt automaton. + *

+ * @param term Term containing field and some pattern structure + * @param automaton Automaton to use for query + */ + public AutomatonQuery(Term term, Automaton automaton) { + super(term); + this.field = term.field(); + this.automaton = automaton; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.MultiTermQuery#getEnum(org.apache.lucene.index.IndexReader) + */ + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new AutomatonTermEnum(automaton, field, reader); + } + + + /* (non-Javadoc) + * @see org.apache.lucene.search.MultiTermQuery#hashCode() + */ + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + ((automaton == null) ? 0 : automaton.hashCode()); + result = prime * result + ((field == null) ? 0 : field.hashCode()); + return result; + } + + + /* (non-Javadoc) + * @see org.apache.lucene.search.MultiTermQuery#equals(java.lang.Object) + */ + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (!super.equals(obj)) + return false; + if (getClass() != obj.getClass()) + return false; + final AutomatonQuery other = (AutomatonQuery) obj; + if (automaton == null) { + if (other.automaton != null) + return false; + } else if (!automaton.equals(other.automaton)) + return false; + if (field == null) { + if (other.field != null) + return false; + } else if (!field.equals(other.field)) + return false; + return true; + } + +} Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonTermEnum.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonTermEnum.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonTermEnum.java (revision 0) @@ -0,0 +1,227 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.RunAutomaton; +import dk.brics.automaton.State; +import dk.brics.automaton.Transition; + +/** + *

+ * A FilteredTermEnum that enumerates terms based upon what is accepted by a FSM. + *

+ *

+ * The algorithm is such:
+ * 1. As long as matches are successful, keep reading sequentially.
+ * 2. When a match fails, skip to the next string in lexicographic order that does not enter a reject state.
+ *

+ *

+ * The algorithm does not attempt to actually skip to the next string that is completely accepted. + * This is not possible when the language accepted by the FSM is not finite (e.g. when the * operator is used). + *

+ */ +class AutomatonTermEnum extends FilteredTermEnum { + private final IndexReader reader; + private final String field; + private final RunAutomaton runAutomaton; + private final Automaton automaton; + private Term lastTerm = null; + + /** + * Construct an enumerator based upon an automaton, enumerating the specified field, working on a supplied reader. + */ + AutomatonTermEnum(Automaton automaton, String field, IndexReader reader) throws IOException { + super(); + this.reader = reader; + this.field = field; + this.automaton = automaton; + + this.automaton.minimize(); /* minimize the automaton, just in case. this also ensures it is determinized. */ + runAutomaton = new RunAutomaton(automaton); /* tableize the automaton */ + + String startPoint = nextString(""); + if (startPoint == null) { // no terms match this automaton... oh well, it will figure this out. + startPoint = ""; + } + + lastTerm = new Term(field, startPoint); + setEnum(reader.terms(lastTerm)); + } + + //@Override + public float difference() { + return 1.0f; + } + + /** + * Returns true if the term matches the automaton. + * Also stashes away the term to assist with smart enumeration. + */ + //@Override + protected boolean termCompare(Term term) { + lastTerm = term; + return (term.field() == field && runAutomaton.run(term.text())); + } + + /** + * increments to the next term matching this automaton. + * after a successful comparison, it simply tries the next term. + * after an unsuccessful comparison, it seeks to a smarter position. 
+ */ + //@Override + public boolean next() throws IOException { + + do { + if (lastTerm.equals(currentTerm)) { /* the last enumeration was a match, don't skip around */ + actualEnum.next(); + } else { /* seek to the next possible string */ + String nextPoint = nextString(lastTerm.text()); + if (nextPoint == null) { /* no more possible strings can match */ + currentTerm = null; + return false; + } + /* replace the old enumerator with a new one, positioned to a nice place */ + actualEnum.close(); + actualEnum = reader.terms(lastTerm.createTerm(nextPoint)); + } + + Term candidateTerm = actualEnum.term(); /* read a term */ + + /* this means end of enumeration: no more terms for this field or no more terms at all */ + if (candidateTerm == null || candidateTerm.field() != field) { + currentTerm = null; + return false; + } + + /* if the term matches the automaton, success! */ + if (termCompare(candidateTerm)) { + currentTerm = candidateTerm; + return true; + } + } while (true); + + } + + /** this is a dummy, it is not used by this class. */ + //@Override + protected boolean endEnum() { + assert false; // should never be called + return (currentTerm != null); + } + + + /** + * Returns the next String in lexicographic order after s that will not put the machine into a reject state. + * If such a string does not exist, returns null. + * + * The correctness of this method depends heavily upon the properties of a DFA. + * + * @param s input String + * @return next valid String + */ + + private final String nextString(String s) { + State state = automaton.getInitialState(); + int pos = 0; + + // walk the automaton until a character is rejected. + for (pos = 0; pos < s.length(); pos++) { + State nextState = state.step(s.charAt(pos)); + if (nextState == null) + break; + else + state = nextState; + } + + // take the useful portion, and the last non-reject state, and attempt to append characters that will match. 
+ String nextString = nextString(s, state, pos); + if (nextString != null) { + return nextString; + } else { /* no more solutions exist from this useful portion, backtrack */ + if (pos == 0) /* all solutions exhausted */ + return null; + char nextChar = s.charAt(pos - 1); + nextChar++; + String sprime = s.substring(0, pos - 1) + nextChar; + if (runAutomaton.run(sprime)) /* if this is accepted it is good to go as-is */ + return sprime; + else + return nextString(sprime); + } + } + + /** + * Returns the next String in lexicographic order after s that will not put the machine into a reject state. + * Appends some characters to the useful portion. If this cannot satisfy the machine, returns null. + * This method will walk the minimal path, in lexicographic order, as long as possible. + * + * @param s input String + * @param state current non-reject state + * @param useful most useful portion of the string + * @return next valid String + */ + private final String nextString(String s, State state, int useful) { + // the next lexicographic character must be greater than the existing character, if it exists. + char c = 0; + if (useful < s.length()) { + c = s.charAt(useful); + c++; + } + + StringBuffer sb = new StringBuffer(); + // append the useful portion + sb.append(s.substring(0, useful)); + + Set visited = new HashSet(); + visited.add(state); + + Iterator transitions = state.getSortedTransitions(false).iterator(); + + // find the minimal path (lexicographic order) that is >= c + while (transitions.hasNext()) { + Transition transition = (Transition) transitions.next(); + if (transition.getMax() >= c) { + char nextChar = (char) Math.max(c, transition.getMin()); + sb.append(nextChar); + state = transition.getDest(); + // as long as is possible, continue down the minimal path. + // if a loop or accept state is encountered, stop. 
+ while (!visited.contains(state) && !state.isAccept()) { + visited.add(state); + transition = (Transition) state.getSortedTransitions(false).get(0); + sb.append(transition.getMin()); + state = transition.getDest(); + } + return sb.toString(); + } + + } + return null; + } + +} Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java (revision 0) @@ -0,0 +1,99 @@ +package org.apache.lucene.search.regex; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.BasicAutomata; +import dk.brics.automaton.BasicOperations; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + *

+ * A Query that accepts the syntax of {@link WildcardQuery} but is implemented with an automaton. + * This accelerates more wildcard cases. + * Most cases are accelerated such as ?(a|b)cd?e, but a leading * is still slow. + *

+ */ +public class AutomatonWildcardQuery extends AutomatonQuery { + private final boolean termContainsWildcard; + + /** + * Construct a new AutomatonWildcardQuery + * @param term query Term + */ + public AutomatonWildcardQuery(Term term) { + super(term, toAutomaton(term)); + termContainsWildcard = (term.text().indexOf('*') != -1) || (term.text().indexOf('?') != -1); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) + */ + public Query rewrite(IndexReader reader) throws IOException { + if (!termContainsWildcard) { + Query simpleQuery = new TermQuery(getTerm()); + if (getConstantScoreRewrite()) + simpleQuery = new ConstantScoreQuery(new QueryWrapperFilter(simpleQuery)); + + simpleQuery.setBoost(getBoost()); + return simpleQuery; + } else { + return super.rewrite(reader); + } + } + + /** + * Convert lucene wildcard syntax into an automaton. + */ + private static Automaton toAutomaton(Term wildcardquery) { + List automata = new ArrayList(); + + String wildcardText = wildcardquery.text(); + + for (int i = 0; i < wildcardText.length(); i++) { + char c = wildcardText.charAt(i); + switch(c) { + case '*': { + automata.add(BasicAutomata.makeAnyString()); + break; + } + case '?': { + automata.add(BasicAutomata.makeAnyChar()); + break; + } + default: { + automata.add(BasicAutomata.makeChar(c)); + } + } + } + + return BasicOperations.concatenate(automata); + } + +} Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonFuzzyQuery.java =================================================================== --- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonFuzzyQuery.java (revision 0) +++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonFuzzyQuery.java (revision 0) @@ -0,0 +1,149 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.RAMDirectory; + +/** + * Tests {@link FuzzyQuery}. 
+ * + */ +public class TestAutomatonFuzzyQuery extends LuceneTestCase { + + public void testFuzziness() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + addDoc("aaaaa", writer); + addDoc("aaaab", writer); + addDoc("aaabb", writer); + addDoc("aabbb", writer); + addDoc("abbbb", writer); + addDoc("bbbbb", writer); + addDoc("ddddd", writer); + writer.optimize(); + writer.close(); + IndexSearcher searcher = new IndexSearcher(directory); + + AutomatonFuzzyQuery query = new AutomatonFuzzyQuery(new Term("field", "aaaaa")); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(2, hits.length); + + + // not similar enough: + query = new AutomatonFuzzyQuery(new Term("field", "xxxxx")); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + query = new AutomatonFuzzyQuery(new Term("field", "aaccc")); // edit distance to "aaaaa" = 3 + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + // query identical to a word in the index: + query = new AutomatonFuzzyQuery(new Term("field", "aaaaa")); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(2, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); + // default allows for up to two edits: + assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); + + // query similar to a word in the index: + query = new AutomatonFuzzyQuery(new Term("field", "aaaac")); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(2, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); + assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); + + query = new AutomatonFuzzyQuery(new Term("field", "ddddX")); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + 
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); + + + // different field = no match: + query = new AutomatonFuzzyQuery(new Term("anotherfield", "ddddX")); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + searcher.close(); + directory.close(); + } + + public void testFuzzinessLong() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + addDoc("aaaaaaa", writer); + addDoc("segment", writer); + writer.optimize(); + writer.close(); + IndexSearcher searcher = new IndexSearcher(directory); + + AutomatonFuzzyQuery query; + // not similar enough: + query = new AutomatonFuzzyQuery(new Term("field", "xxxxx")); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + // "student" doesn't match anymore thanks to increased minimum similarity: + query = new AutomatonFuzzyQuery(new Term("field", "student")); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + searcher.close(); + directory.close(); + } + + public void testTransposition() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + addDoc("aaaabbbb", writer); + addDoc("segment", writer); + writer.optimize(); + writer.close(); + IndexSearcher searcher = new IndexSearcher(directory); + + AutomatonFuzzyQuery query; + // transposition on + query = new AutomatonFuzzyQuery(new Term("field", "aaababbb")); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + + // transposition off + query = new AutomatonFuzzyQuery(new Term("field", "aaababbb"), false); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + searcher.close(); + directory.close(); + 
} + + private void addDoc(String text, IndexWriter writer) throws IOException { + Document doc = new Document(); + doc.add(new Field("field", text, Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + +} Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java =================================================================== --- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0) +++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0) @@ -0,0 +1,88 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import junit.framework.TestCase; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.IndexSearcher; + +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; + +public class TestAutomatonQuery extends TestCase { + private IndexSearcher searcher; + private final String FN = "field"; + + public void setUp() { + RAMDirectory directory = new RAMDirectory(); + try { + IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory); + } catch (Exception e) { + fail(e.toString()); + } + } + + public void tearDown() { + try { + searcher.close(); + } catch (Exception e) { + fail(e.toString()); + } + } + + private Term newTerm(String value) { return new Term(FN, value); } + + private int regexQueryNrHits(String regex) throws Exception { + AutomatonQuery query = new AutomatonQuery( newTerm(regex)); + return searcher.search(query).length(); + } + + public void testRegex1() throws Exception { + assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); + } + + public void testRegex2() throws Exception { + assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); + } + + public void testRegex3() throws Exception { + assertEquals(0, regexQueryNrHits("q.[aeiou]c")); + } + + public void testEquals() throws Exception { + RegexQuery query1 = new RegexQuery( newTerm("foo.*")); + query1.setRegexImplementation(new JakartaRegexpCapabilities()); + + AutomatonQuery query2 = new AutomatonQuery( 
newTerm("foo.*")); + assertFalse(query1.equals(query2)); + } + +} + Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcardQuery.java =================================================================== --- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcardQuery.java (revision 0) +++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcardQuery.java (revision 0) @@ -0,0 +1,272 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.RAMDirectory; + +import java.io.IOException; + +/** + * TestWildcard tests the '*' and '?' wildcard characters. + * + * @version $Id: TestWildcard.java 694004 2008-09-10 21:38:52Z mikemccand $ + * + */ +public class TestAutomatonWildcardQuery + extends LuceneTestCase { + public void testEquals() { + AutomatonWildcardQuery wq1 = new AutomatonWildcardQuery(new Term("field", "b*a")); + AutomatonWildcardQuery wq2 = new AutomatonWildcardQuery(new Term("field", "b*a")); + AutomatonWildcardQuery wq3 = new AutomatonWildcardQuery(new Term("field", "b*a")); + + // reflexive? + assertEquals(wq1, wq2); + assertEquals(wq2, wq1); + + // transitive? + assertEquals(wq2, wq3); + assertEquals(wq1, wq3); + + assertFalse(wq1.equals(null)); + + FuzzyQuery fq = new FuzzyQuery(new Term("field", "b*a")); + assertFalse(wq1.equals(fq)); + assertFalse(fq.equals(wq1)); + } + + /** + * Tests if a AutomatonWildcardQuery that has no wildcard in the term is rewritten to a single + * TermQuery. 
+ */ + public void testTermWithoutWildcard() throws IOException { + RAMDirectory indexStore = getIndexStore("field", new String[]{"nowildcard", "nowildcardx"}); + IndexSearcher searcher = new IndexSearcher(indexStore); + + Query wq = new AutomatonWildcardQuery(new Term("field", "nowildcard")); + assertMatches(searcher, wq, 1); + + wq = searcher.rewrite(wq); + assertTrue(wq instanceof TermQuery); + } + + /** + * Tests Wildcard queries with an asterisk. + */ + public void testAsterisk() + throws IOException { + RAMDirectory indexStore = getIndexStore("body", new String[] + {"metal", "metals"}); + IndexSearcher searcher = new IndexSearcher(indexStore); + Query query1 = new TermQuery(new Term("body", "metal")); + Query query2 = new AutomatonWildcardQuery(new Term("body", "metal*")); + Query query3 = new AutomatonWildcardQuery(new Term("body", "m*tal")); + Query query4 = new AutomatonWildcardQuery(new Term("body", "m*tal*")); + Query query5 = new AutomatonWildcardQuery(new Term("body", "m*tals")); + + BooleanQuery query6 = new BooleanQuery(); + query6.add(query5, BooleanClause.Occur.SHOULD); + + BooleanQuery query7 = new BooleanQuery(); + query7.add(query3, BooleanClause.Occur.SHOULD); + query7.add(query5, BooleanClause.Occur.SHOULD); + + // Queries do not automatically lower-case search terms: + Query query8 = new AutomatonWildcardQuery(new Term("body", "M*tal*")); + + assertMatches(searcher, query1, 1); + assertMatches(searcher, query2, 2); + assertMatches(searcher, query3, 1); + assertMatches(searcher, query4, 2); + assertMatches(searcher, query5, 1); + assertMatches(searcher, query6, 1); + assertMatches(searcher, query7, 2); + assertMatches(searcher, query8, 0); + assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tall")), 0); + assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tal")), 1); + assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tal*")), 2); + } + + /** + * Tests Wildcard queries with a question 
mark. + * + * @throws IOException if an error occurs + */ + public void testQuestionmark() + throws IOException { + RAMDirectory indexStore = getIndexStore("body", new String[] + {"metal", "metals", "mXtals", "mXtXls"}); + IndexSearcher searcher = new IndexSearcher(indexStore); + Query query1 = new AutomatonWildcardQuery(new Term("body", "m?tal")); + Query query2 = new AutomatonWildcardQuery(new Term("body", "metal?")); + Query query3 = new AutomatonWildcardQuery(new Term("body", "metals?")); + Query query4 = new AutomatonWildcardQuery(new Term("body", "m?t?ls")); + Query query5 = new AutomatonWildcardQuery(new Term("body", "M?t?ls")); + Query query6 = new AutomatonWildcardQuery(new Term("body", "meta??")); + + assertMatches(searcher, query1, 1); + assertMatches(searcher, query2, 1); + assertMatches(searcher, query3, 0); + assertMatches(searcher, query4, 3); + assertMatches(searcher, query5, 0); + assertMatches(searcher, query6, 1); // Query: 'meta??' matches 'metals' not 'metal' + } + + private RAMDirectory getIndexStore(String field, String[] contents) + throws IOException { + RAMDirectory indexStore = new RAMDirectory(); + IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + for (int i = 0; i < contents.length; ++i) { + Document doc = new Document(); + doc.add(new Field(field, contents[i], Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + writer.optimize(); + writer.close(); + + return indexStore; + } + + private void assertMatches(IndexSearcher searcher, Query q, int expectedMatches) + throws IOException { + ScoreDoc[] result = searcher.search(q, null, 1000).scoreDocs; + assertEquals(expectedMatches, result.length); + } + + /** + * Test that wild card queries are parsed to the correct type and are searched correctly. + * This test looks at both parsing and execution of wildcard queries. 
+ * Although placed here, it also tests prefix queries, verifying that + * prefix queries are not parsed into wild card queries, and viceversa. + * @throws Exception + */ + public void testParsingAndSearching() throws Exception { + String field = "content"; + boolean dbg = false; + QueryParser qp = new QueryParser(field, new WhitespaceAnalyzer()) { + + @Override + protected Query newWildcardQuery(Term t) { + return new AutomatonWildcardQuery(t); + } + + }; + qp.setAllowLeadingWildcard(true); + String docs[] = { + "\\ abcdefg1", + "\\79 hijklmn1", + "\\\\ opqrstu1", + }; + // queries that should find all docs + String matchAll[] = { + "*", "*1", "**1", "*?", "*?1", "?*1", "**", "***", "\\\\*" + }; + // queries that should find no docs + String matchNone[] = { + "a*h", "a?h", "*a*h", "?a", "a?", + }; + // queries that should be parsed to prefix queries + String matchOneDocPrefix[][] = { + {"a*", "ab*", "abc*", }, // these should find only doc 0 + {"h*", "hi*", "hij*", "\\\\7*"}, // these should find only doc 1 + {"o*", "op*", "opq*", "\\\\\\\\*"}, // these should find only doc 2 + }; + // queries that should be parsed to wildcard queries + String matchOneDocWild[][] = { + {"*a*", "*ab*", "*abc**", "ab*e*", "*g?", "*f?1", "abc**"}, // these should find only doc 0 + {"*h*", "*hi*", "*hij**", "hi*k*", "*n?", "*m?1", "hij**"}, // these should find only doc 1 + {"*o*", "*op*", "*opq**", "op*q*", "*u?", "*t?1", "opq**"}, // these should find only doc 2 + }; + + // prepare the index + RAMDirectory dir = new RAMDirectory(); + IndexWriter iw = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(new Field(field,docs[i],Store.NO,Index.ANALYZED)); + iw.addDocument(doc); + } + iw.close(); + + IndexSearcher searcher = new IndexSearcher(dir); + + // test queries that must find all + for (int i = 0; i < matchAll.length; i++) { + String qtxt = matchAll[i]; + Query q = 
qp.parse(qtxt); + if (dbg) System.out.println("matchAll: qtxt="+qtxt+" q="+q+" "+q.getClass().getName()); + ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; + assertEquals(docs.length,hits.length); + } + + // test queries that must find none + for (int i = 0; i < matchNone.length; i++) { + String qtxt = matchNone[i]; + Query q = qp.parse(qtxt); + if (dbg) System.out.println("matchNone: qtxt="+qtxt+" q="+q+" "+q.getClass().getName()); + ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; + assertEquals(0,hits.length); + } + + // test queries that must be prefix queries and must find only one doc + for (int i = 0; i < matchOneDocPrefix.length; i++) { + for (int j = 0; j < matchOneDocPrefix[i].length; j++) { + String qtxt = matchOneDocPrefix[i][j]; + Query q = qp.parse(qtxt); + if (dbg) System.out.println("match 1 prefix: doc="+docs[i]+" qtxt="+qtxt+" q="+q+" "+q.getClass().getName()); + assertEquals(PrefixQuery.class, q.getClass()); + ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; + assertEquals(1,hits.length); + assertEquals(i,hits[0].doc); + } + } + + // test queries that must be wildcard queries and must find only one doc + for (int i = 0; i < matchOneDocPrefix.length; i++) { + for (int j = 0; j < matchOneDocWild[i].length; j++) { + String qtxt = matchOneDocWild[i][j]; + Query q = qp.parse(qtxt); + if (dbg) System.out.println("match 1 wild: doc="+docs[i]+" qtxt="+qtxt+" q="+q+" "+q.getClass().getName()); + assertEquals(AutomatonWildcardQuery.class, q.getClass()); + ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; + assertEquals(1,hits.length); + assertEquals(i,hits[0].doc); + } + } + + searcher.close(); + } + +}