Index: contrib/regex/build.xml =================================================================== --- contrib/regex/build.xml (revision 725886) +++ contrib/regex/build.xml (working copy) @@ -24,7 +24,7 @@ - + = Regents of the University of California + = University of California, Berkeley + = 1998 + +In the original BSD license, both occurrences of the phrase "COPYRIGHT HOLDERS AND CONTRIBUTORS" in the disclaimer read "REGENTS AND CONTRIBUTORS". + +Here is the license template: + +Copyright (c) , +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Index: contrib/regex/lib/automaton.jar =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Property changes on: contrib\regex\lib\automaton.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFilter.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFilter.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFilter.java (revision 0) @@ -0,0 +1,248 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.Filter; +import org.apache.lucene.util.OpenBitSet; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.RegExp; +import dk.brics.automaton.RunAutomaton; + +/** + *

+ * A RegexpFilter that utilizes the BRICS automaton package: http://www.brics.dk/automaton/ + *

+ * + *

+ * The expression is converted to a DFA, and the state machine is used to optimize term enumeration. + *

+ * + *

+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *

+ * + */ + +public class AutomatonFilter extends Filter { + private final Term term; + private final Automaton automaton; + private static final int ACCEPTED = -2; + + /** + *

+ * Construct a new AutomatonFilter. + *

+ * Term is expected to contain regex syntax compatible with the BRICS package: + * http://www.brics.dk/automaton/ + *

+ *

+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *

+ * @param term Term containing field and regular expression + */ + public AutomatonFilter(Term term) { + this(term, new RegExp(term.text()).toAutomaton()); + } + + /** + *

+ * Construct a new Automaton Filter + *

+ * + * @param field Lucene field to query + * @param automaton Brics Automaton + */ + public AutomatonFilter(String field, Automaton automaton) { + this(new Term(field), automaton); + } + + /** + *

+ * Construct a new Automaton Filter + *

+ * + * @param term Term for the Lucene field to query + * @param automaton Brics automaton + */ + protected AutomatonFilter(Term term, Automaton automaton) { + super(); + this.term = term; + this.automaton = automaton; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader) + */ + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + /* + * The algorithm here is pretty basic. Enumerate terms but instead of a binary accept/reject do: + * + * Look at the portion that is OK (did not enter a reject state in the DFA) + * Generate the next possible String and seek to that. + * + * Because this implementation is bounded by alphabet size, it could be slightly improved. + * One possibility is the use of Automaton State/Transition classes which provide character intervals. + * For large numbers of CJK terms where the "alphabet" is large, this might optimize things a bit better. + * + */ + OpenBitSet bits = new OpenBitSet(reader.maxDoc()); + String prefix = automaton.getCommonPrefix(); + RunAutomaton runAutomaton = new RunAutomaton(automaton); + + /* if there is a static prefix, why not start here, but probably not truly necessary */ + TermEnum enumerator = reader.terms(term.createTerm(prefix)); + TermDocs termDocs = reader.termDocs(); + + Term t = null; + while ((t = enumerator.term()) != null) { + + if (t.field() != term.field()) { /* wrong field, enumerated all the terms we need */ + break; + } + + String termText = t.text(); + + if (!termText.startsWith(prefix)) { /* we are done. this helps for a large alphabet */ + break; + } + + /* run the string against the automaton. + * Either the string is accepted, or it is rejected. + * When rejected, the acceptStatus contains the highest index that DID NOT go into a reject state. 
+ */ + int acceptStatus = run(runAutomaton, termText); + if (acceptStatus == ACCEPTED) { + /* in this case, add all the docs and keep enumerating + */ + termDocs.seek(t); + while (termDocs.next()) + bits.set(termDocs.doc()); + enumerator.next(); + } else { + /* in this case, want to take the portion that wasn't rejected, and generate the next possible unicode string. + * instruct TermEnum to seek to that location. + */ + enumerator.close(); + Term next = term.createTerm(nextString(termText, acceptStatus)); + enumerator = reader.terms(next); + } + } + enumerator.close(); + termDocs.close(); + return bits; + } + + /** + * Helper function to generate the next possible Unicode String + * @param termText String value of term + * @param acceptStatus max character position that did not enter into a reject state + * @return next possible unicode String + */ + + private static final String nextString(String termText, int acceptStatus) { + if (termText.length() == 0) /* empty string */ + return "\u0000"; + + int boundary = acceptStatus + 1; + + StringBuffer prefix = new StringBuffer(); + prefix.append(termText.substring(0, boundary)); + int nextChar = -1; + if (boundary < termText.length()) + nextChar = termText.charAt(boundary); + /* U+FFFF is guaranteed not to ever be a valid unicode character so no overflow risk here */ + prefix.append((char)(nextChar + 1)); + + return prefix.toString(); + } + + + /** + *

+ * Return the character position of the longest portion that doesn't enter a reject state. + * This method returns one of three values + * ACCEPTED (-2): This means this is a match, it ends in an accept state. + * -1: the first character entered a reject state, therefore no characters are "useful" + * n: where n is the position of the longest portion that did not enter a reject state. + *

+ * @param ra RunAutomaton + * @param s String + * @return index of the longest portion that doesn't enter reject state, or ACCEPTED, or -1 + */ + private static final int run(RunAutomaton ra, String s) { + int state = ra.getInitialState(); + int length = s.length(); + int max = -1; + for (int offset = 0; offset < length; offset++) { + state = ra.step(state, s.charAt(offset)); + if (state == -1) + break; + else + max++; + } + + if (state >= 0 && ra.isAccept(state)) + return ACCEPTED; + + return max; + } + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((automaton == null) ? 0 : automaton.hashCode()); + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AutomatonFilter other = (AutomatonFilter) obj; + if (automaton == null) { + if (other.automaton != null) + return false; + } else if (!automaton.equals(other.automaton)) + return false; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } + +} Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0) @@ -0,0 +1,115 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.ToStringUtils; + +/** + *

+ * A RegexpQuery that utilizes the BRICS automaton package: http://www.brics.dk/automaton/ + *

+ * + *

+ * Queries are converted to a DFA, and the state machine is used to optimize term enumeration. + *

+ * Score is constant and equal to the boost. + *

+ * + *

+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *

+ * + */ +public class AutomatonQuery extends Query { + protected Term term; + + /** + *

+ * Construct a new AutomatonQuery. + *

+ * Term is expected to contain regex syntax compatible with the BRICS package: + * http://www.brics.dk/automaton/ + *

+ *

+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *

+ * @param term Term containing field and regular expression + */ + public AutomatonQuery(Term term) { + this.term = term; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) + */ + public Query rewrite(IndexReader reader) throws IOException { + Query query = new ConstantScoreQuery(new AutomatonFilter(term)); + query.setBoost(getBoost()); + return query; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#toString(java.lang.String) + */ + public String toString(String field) { + StringBuffer buffer = new StringBuffer(); + buffer.append("automatonQuery("); + buffer.append(term); + buffer.append(")"); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } + + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AutomatonQuery other = (AutomatonQuery) obj; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } + +} Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java (revision 0) @@ -0,0 +1,133 @@ +package org.apache.lucene.search.regex; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; +import 
org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.ToStringUtils; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.BasicAutomata; +import dk.brics.automaton.BasicOperations; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + *

+ * A Query that accepts the syntax of {@link WildcardQuery} but implements with an Automaton. + * This accelerates more wildcard cases. + * Most cases are accelerated such as ?(a|b)cd?e, but leading * is still slow. + *

+ *

+ * Score is constant and equal to the boost + *

+ */ +public class AutomatonWildcardQuery extends Query { + protected Term term; + private boolean termContainsWildcard; + + /** + * Construct a new AutomatonWildcardQuery + * @param term query Term + */ + public AutomatonWildcardQuery(Term term) { + this.term = term; + this.termContainsWildcard = (term.text().indexOf('*') != -1) || (term.text().indexOf('?') != -1); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) + */ + public Query rewrite(IndexReader reader) throws IOException { + if (!termContainsWildcard) + return new TermQuery(term); + + List automata = new ArrayList(); + + String wildcardText = term.text(); + + for (int i = 0; i < wildcardText.length(); i++) { + char c = wildcardText.charAt(i); + switch(c) { + case '*': { + automata.add(BasicAutomata.makeAnyString()); + break; + } + case '?': { + automata.add(BasicAutomata.makeAnyChar()); + break; + } + default: { + automata.add(BasicAutomata.makeChar(c)); + } + } + } + + Automaton combined = BasicOperations.concatenate(automata); + Query query = new ConstantScoreQuery(new AutomatonFilter(term.field(), combined)); + query.setBoost(getBoost()); + return query; + } + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((term == null) ? 
0 : term.hashCode()); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AutomatonWildcardQuery other = (AutomatonWildcardQuery) obj; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#toString(java.lang.String) + */ + public String toString(String field) { + StringBuffer buffer = new StringBuffer(); + buffer.append("automatonWildcardQuery("); + buffer.append(term); + buffer.append(")"); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } +} Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java =================================================================== --- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0) +++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0) @@ -0,0 +1,87 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.RAMDirectory; + +public class TestAutomatonQuery extends TestCase { + private IndexSearcher searcher; + private final String FN = "field"; + + public void setUp() { + RAMDirectory directory = new RAMDirectory(); + try { + IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory); + } catch (Exception e) { + fail(e.toString()); + } + } + + public void tearDown() { + try { + searcher.close(); + } catch (Exception e) { + fail(e.toString()); + } + } + + private Term newTerm(String value) { return new Term(FN, value); } + + private int regexQueryNrHits(String regex) throws Exception { + AutomatonQuery query = new AutomatonQuery( newTerm(regex)); + return searcher.search(query).length(); + } + + + public void testRegex1() throws Exception { + assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); + } + + public void testRegex2() throws Exception { + assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); + } + + public void testRegex3() throws Exception { + assertEquals(0, regexQueryNrHits("q.[aeiou]c")); + } + + + public void testEquals() throws Exception { + RegexQuery query1 = new RegexQuery( newTerm("foo.*")); + + RegexQuery query2 = new RegexQuery( newTerm("foo.*")); + assertEquals(query1.equals(query2), true); + } + +} + Index: 
contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcard.java
===================================================================
--- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcard.java	(revision 0)
+++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcard.java	(revision 0)
@@ -0,0 +1,272 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * TestAutomatonWildcard tests the '*' and '?' wildcard characters.
+ *
+ * Adapted from TestWildcard, with AutomatonWildcardQuery substituted for
+ * WildcardQuery throughout.
+ */
+public class TestAutomatonWildcard
+  extends LuceneTestCase {
+  // equals() must be reflexive, symmetric, and transitive, and must reject
+  // other Query subclasses built from the same Term.
+  public void testEquals() {
+    AutomatonWildcardQuery wq1 = new AutomatonWildcardQuery(new Term("field", "b*a"));
+    AutomatonWildcardQuery wq2 = new AutomatonWildcardQuery(new Term("field", "b*a"));
+    AutomatonWildcardQuery wq3 = new AutomatonWildcardQuery(new Term("field", "b*a"));
+
+    // reflexive?
+    assertEquals(wq1, wq2);
+    assertEquals(wq2, wq1);
+
+    // transitive?
+    assertEquals(wq2, wq3);
+    assertEquals(wq1, wq3);
+
+    assertFalse(wq1.equals(null));
+
+    FuzzyQuery fq = new FuzzyQuery(new Term("field", "b*a"));
+    assertFalse(wq1.equals(fq));
+    assertFalse(fq.equals(wq1));
+  }
+
+  /**
+   * Tests if a WildcardQuery that has no wildcard in the term is rewritten to a single
+   * TermQuery.
+   */
+  public void testTermWithoutWildcard() throws IOException {
+    RAMDirectory indexStore = getIndexStore("field", new String[]{"nowildcard", "nowildcardx"});
+    IndexSearcher searcher = new IndexSearcher(indexStore);
+
+    Query wq = new AutomatonWildcardQuery(new Term("field", "nowildcard"));
+    assertMatches(searcher, wq, 1);
+
+    wq = searcher.rewrite(wq);
+    assertTrue(wq instanceof TermQuery);
+  }
+
+  /**
+   * Tests Wildcard queries with an asterisk.
+   */
+  public void testAsterisk()
+      throws IOException {
+    RAMDirectory indexStore = getIndexStore("body", new String[]
+      {"metal", "metals"});
+    IndexSearcher searcher = new IndexSearcher(indexStore);
+    Query query1 = new TermQuery(new Term("body", "metal"));
+    Query query2 = new AutomatonWildcardQuery(new Term("body", "metal*"));
+    Query query3 = new AutomatonWildcardQuery(new Term("body", "m*tal"));
+    Query query4 = new AutomatonWildcardQuery(new Term("body", "m*tal*"));
+    Query query5 = new AutomatonWildcardQuery(new Term("body", "m*tals"));
+
+    BooleanQuery query6 = new BooleanQuery();
+    query6.add(query5, BooleanClause.Occur.SHOULD);
+
+    BooleanQuery query7 = new BooleanQuery();
+    query7.add(query3, BooleanClause.Occur.SHOULD);
+    query7.add(query5, BooleanClause.Occur.SHOULD);
+
+    // Queries do not automatically lower-case search terms:
+    Query query8 = new AutomatonWildcardQuery(new Term("body", "M*tal*"));
+
+    assertMatches(searcher, query1, 1);
+    assertMatches(searcher, query2, 2);
+    assertMatches(searcher, query3, 1);
+    assertMatches(searcher, query4, 2);
+    assertMatches(searcher, query5, 1);
+    assertMatches(searcher, query6, 1);
+    assertMatches(searcher, query7, 2);
+    assertMatches(searcher, query8, 0);
+    assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tall")), 0);
+    assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tal")), 1);
+    assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tal*")), 2);
+  }
+
+  /**
+   * Tests Wildcard queries with a question mark.
+   *
+   * @throws IOException if an error occurs
+   */
+  public void testQuestionmark()
+      throws IOException {
+    RAMDirectory indexStore = getIndexStore("body", new String[]
+      {"metal", "metals", "mXtals", "mXtXls"});
+    IndexSearcher searcher = new IndexSearcher(indexStore);
+    Query query1 = new AutomatonWildcardQuery(new Term("body", "m?tal"));
+    Query query2 = new AutomatonWildcardQuery(new Term("body", "metal?"));
+    Query query3 = new AutomatonWildcardQuery(new Term("body", "metals?"));
+    Query query4 = new AutomatonWildcardQuery(new Term("body", "m?t?ls"));
+    Query query5 = new AutomatonWildcardQuery(new Term("body", "M?t?ls"));
+    Query query6 = new AutomatonWildcardQuery(new Term("body", "meta??"));
+
+    assertMatches(searcher, query1, 1);
+    assertMatches(searcher, query2, 1);
+    assertMatches(searcher, query3, 0);
+    assertMatches(searcher, query4, 3);
+    assertMatches(searcher, query5, 0);
+    assertMatches(searcher, query6, 1); // Query: 'meta??' matches 'metals' not 'metal'
+  }
+
+  // Builds a single-segment RAMDirectory with one document per content string.
+  private RAMDirectory getIndexStore(String field, String[] contents)
+      throws IOException {
+    RAMDirectory indexStore = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+    for (int i = 0; i < contents.length; ++i) {
+      Document doc = new Document();
+      doc.add(new Field(field, contents[i], Field.Store.YES, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+    writer.optimize();
+    writer.close();
+
+    return indexStore;
+  }
+
+  private void assertMatches(IndexSearcher searcher, Query q, int expectedMatches)
+      throws IOException {
+    ScoreDoc[] result = searcher.search(q, null, 1000).scoreDocs;
+    assertEquals(expectedMatches, result.length);
+  }
+
+  /**
+   * Test that wild card queries are parsed to the correct type and are searched correctly.
+   * This test looks at both parsing and execution of wildcard queries.
+   * Although placed here, it also tests prefix queries, verifying that
+   * prefix queries are not parsed into wild card queries, and vice versa.
+   * @throws Exception
+   */
+  public void testParsingAndSearching() throws Exception {
+    String field = "content";
+    boolean dbg = false;
+    // Anonymous subclass routes all parsed wildcard terms through AutomatonWildcardQuery.
+    QueryParser qp = new QueryParser(field, new WhitespaceAnalyzer()) {
+
+      protected Query newWildcardQuery(Term t) {
+        return new AutomatonWildcardQuery(t);
+      }
+
+    };
+
+    qp.setAllowLeadingWildcard(true);
+    String docs[] = {
+      "\\ abcdefg1",
+      "\\79 hijklmn1",
+      "\\\\ opqrstu1",
+    };
+    // queries that should find all docs
+    String matchAll[] = {
+      "*", "*1", "**1", "*?", "*?1", "?*1", "**", "***", "\\\\*"
+    };
+    // queries that should find no docs
+    String matchNone[] = {
+      "a*h", "a?h", "*a*h", "?a", "a?",
+    };
+    // queries that should be parsed to prefix queries
+    String matchOneDocPrefix[][] = {
+      {"a*", "ab*", "abc*", }, // these should find only doc 0
+      {"h*", "hi*", "hij*", "\\\\7*"}, // these should find only doc 1
+      {"o*", "op*", "opq*", "\\\\\\\\*"}, // these should find only doc 2
+    };
+    // queries that should be parsed to wildcard queries
+    String matchOneDocWild[][] = {
+      {"*a*", "*ab*", "*abc**", "ab*e*", "*g?", "*f?1", "abc**"}, // these should find only doc 0
+      {"*h*", "*hi*", "*hij**", "hi*k*", "*n?", "*m?1", "hij**"}, // these should find only doc 1
+      {"*o*", "*op*", "*opq**", "op*q*", "*u?", "*t?1", "opq**"}, // these should find only doc 2
+    };
+
+    // prepare the index
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter iw = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    for (int i = 0; i < docs.length; i++) {
+      Document doc = new Document();
+      doc.add(new Field(field,docs[i],Store.NO,Index.ANALYZED));
+      iw.addDocument(doc);
+    }
+    iw.close();
+
+    IndexSearcher searcher = new IndexSearcher(dir);
+
+    // test queries that must find all
+    for (int i = 0; i < matchAll.length; i++) {
+      String qtxt = matchAll[i];
+      Query q = qp.parse(qtxt);
+      if (dbg) System.out.println("matchAll: qtxt="+qtxt+" q="+q+" "+q.getClass().getName());
+      ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
+      assertEquals(docs.length,hits.length);
+    }
+
+    // test queries that must find none
+    for (int i = 0; i < matchNone.length; i++) {
+      String qtxt = matchNone[i];
+      Query q = qp.parse(qtxt);
+      if (dbg) System.out.println("matchNone: qtxt="+qtxt+" q="+q+" "+q.getClass().getName());
+      ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
+      assertEquals(0,hits.length);
+    }
+
+    // test queries that must be prefix queries and must find only one doc
+    for (int i = 0; i < matchOneDocPrefix.length; i++) {
+      for (int j = 0; j < matchOneDocPrefix[i].length; j++) {
+        String qtxt = matchOneDocPrefix[i][j];
+        Query q = qp.parse(qtxt);
+        if (dbg) System.out.println("match 1 prefix: doc="+docs[i]+" qtxt="+qtxt+" q="+q+" "+q.getClass().getName());
+        assertEquals(PrefixQuery.class, q.getClass());
+        ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
+        assertEquals(1,hits.length);
+        assertEquals(i,hits[0].doc);
+      }
+    }
+
+    // test queries that must be wildcard queries and must find only one doc
+    for (int i = 0; i < matchOneDocPrefix.length; i++) {
+      for (int j = 0; j < matchOneDocWild[i].length; j++) {
+        String qtxt = matchOneDocWild[i][j];
+        Query q = qp.parse(qtxt);
+        if (dbg) System.out.println("match 1 wild: doc="+docs[i]+" qtxt="+qtxt+" q="+q+" "+q.getClass().getName());
+        assertEquals(AutomatonWildcardQuery.class, q.getClass());
+        ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
+        assertEquals(1,hits.length);
+        assertEquals(i,hits[0].doc);
+      }
+    }
+
+    searcher.close();
+  }
+
+}