Index: contrib/regex/build.xml =================================================================== --- contrib/regex/build.xml (revision 765635) +++ contrib/regex/build.xml (working copy) @@ -24,7 +24,7 @@ - +<OWNER> = Regents of the University of California +<ORGANIZATION> = University of California, Berkeley +<YEAR> = 1998 + +In the original BSD license, both occurrences of the phrase "COPYRIGHT HOLDERS AND CONTRIBUTORS" in the disclaimer read "REGENTS AND CONTRIBUTORS". + +Here is the license template: + +Copyright (c) <YEAR>, <OWNER> +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Index: contrib/regex/lib/automaton.jar =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Property changes on: contrib\regex\lib\automaton.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0) @@ -0,0 +1,123 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.search.MultiTermQuery; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.RegExp; + +/** + *

+ * A RegexpQuery that utilizes the BRICS automaton package: http://www.brics.dk/automaton/ + *

+ * + *

+ * Queries are converted to a DFA, and the state machine is used to optimize term enumeration. + *

+ * + *

+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *

+ *
+ */
+public class AutomatonQuery extends MultiTermQuery {
+  protected Automaton automaton;
+  protected String field;
+
+  /**
+   * Creates a query whose term text is a regular expression.
+   * <p>
+   * The text of <code>term</code> is parsed with the BRICS regular expression
+   * parser (http://www.brics.dk/automaton/) and converted to an automaton.
+   * </p>
+   * <p>
+   * Please note that this regex syntax is a bit different from others in that
+   * ^ and $ are implied.
+   * </p>
+   * @param term Term holding the field name and the regular expression
+   */
+  public AutomatonQuery(Term term) {
+    this(term, new RegExp(term.text()).toAutomaton());
+  }
+
+  /**
+   * Creates a query around an automaton that has already been built.
+   *
+   * @param term Term supplying the field name; it is also passed on to MultiTermQuery
+   * @param automaton Automaton to use for the query
+   */
+  public AutomatonQuery(Term term, Automaton automaton) {
+    super(term);
+    this.automaton = automaton;
+    this.field = term.field();
+  }
+
+  /**
+   * Hands back an AutomatonTermEnum over this query's field, driven by this query's automaton.
+   *
+   * @see org.apache.lucene.search.MultiTermQuery#getEnum(org.apache.lucene.index.IndexReader)
+   */
+  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
+    final FilteredTermEnum matchingTerms = new AutomatonTermEnum(automaton, field, reader);
+    return matchingTerms;
+  }
+
+
+  /**
+   * Mixes the automaton and the field name into the hash code of the superclass.
+   *
+   * @see org.apache.lucene.search.MultiTermQuery#hashCode()
+   */
+  public int hashCode() {
+    int hash = super.hashCode();
+    hash = 31 * hash + (automaton != null ? automaton.hashCode() : 0);
+    hash = 31 * hash + (field != null ? field.hashCode() : 0);
+    return hash;
+  }
+
+
+  /**
+   * Two queries are equal when the superclass considers them equal, they are of exactly
+   * the same class, and both their automata and their field names are equal (or both null).
+   *
+   * @see org.apache.lucene.search.MultiTermQuery#equals(java.lang.Object)
+   */
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null || !super.equals(obj) || getClass() != obj.getClass()) {
+      return false;
+    }
+    final AutomatonQuery that = (AutomatonQuery) obj;
+    final boolean sameAutomaton =
+        (automaton == null) ? that.automaton == null : automaton.equals(that.automaton);
+    if (!sameAutomaton) {
+      return false;
+    }
+    return (field == null) ? that.field == null : field.equals(that.field);
+  }
+
+}
Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonTermEnum.java
===================================================================
--- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonTermEnum.java (revision 0)
+++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonTermEnum.java (revision 0)
@@ -0,0 +1,194 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.RunAutomaton; + +/** + *

+ * A FilteredTermEnum that enumerates terms based upon what is accepted by a FSM. + *

+ *

+ * The algorithm is such:
+ * 1. As long as matches are successful, keep reading sequentially.
+ * 2. When a match fails, skip to the next string in lexicographic order that does not enter a reject state.
+ *

+ *

+ * The algorithm does not attempt to actually skip to the next string that is completely accepted. + * This is not possible when the language accepted by the FSM is not finite (i.e. * operator). + *

+ */
+final class AutomatonTermEnum extends FilteredTermEnum {
+  private final IndexReader reader;
+  private final String field;
+  private final RunAutomaton runAutomaton;
+  private Term lastTerm = null; // last term passed to termCompare(); before that, the initial seek target set by the constructor
+
+  /**
+   * Construct an enumerator based upon an automaton, enumerating the specified field, working on a supplied reader.
+   * <p>
+   * The enumeration starts at the automaton's common prefix; when that prefix is empty it starts at
+   * <code>nextString("")</code>, and when that is null as well it starts at the empty string.
+   * </p>
+   * @param automaton Automaton deciding which terms are accepted; <code>minimize()</code> is invoked on this instance
+   * @param field name of the field whose terms are enumerated
+   * @param reader IndexReader supplying the terms
+   * @throws IOException declared for the <code>reader.terms</code> / <code>setEnum</code> calls made here
+   */
+  AutomatonTermEnum(Automaton automaton, String field, IndexReader reader) throws IOException {
+    super();
+    this.reader = reader;
+    this.field = field;
+
+    automaton.minimize(); /* minimize the automaton, just in case */
+    runAutomaton = new RunAutomaton(automaton); /* tableize the automaton */
+
+    String startPoint = automaton.getCommonPrefix(); /* if there is a common regexp prefix, start there */
+    if (startPoint.length() == 0) /* no prefix, start at the first reasonable spot */
+      startPoint = nextString("");
+    if (startPoint == null) { // no terms match this automaton... oh well, it will figure this out.
+      startPoint = "";
+    }
+
+    lastTerm = new Term(field, startPoint);
+    setEnum(reader.terms(lastTerm)); // NOTE(review): presumably setEnum() tests the first term and calls next() when it does not match - confirm against FilteredTermEnum
+  }
+
+  //@Override
+  public float difference() { // reports the same constant for every term
+    return 1.0f;
+  }
+
+  /**
+   * Returns true if the term matches the automaton.
+   * Also stashes away the term to assist with smart enumeration.
+   * @param term candidate term; it is remembered in <code>lastTerm</code> whether or not it matches
+   * @return true only when the term belongs to this enum's field and the automaton accepts its text
+   */
+  //@Override
+  protected boolean termCompare(Term term) {
+    lastTerm = term;
+    return (term.field() == field && runAutomaton.run(term.text())); // NOTE(review): field names are compared by identity, not equals(); presumably relies on interned field names - confirm
+  }
+
+  /**
+   * increments to the next term matching this automaton.
+   * after a successful comparison, it simply tries the next term.
+   * after an unsuccessful comparison, it seeks to a smarter position.
+   * @return true when positioned on a matching term; false (with currentTerm set to null) when
+   *         nextString() finds no further candidate, the underlying enum is exhausted, or the
+   *         enum has moved on to a different field
+   */
+  //@Override
+  public boolean next() throws IOException {
+
+    do {
+      if (lastTerm.equals(currentTerm)) { /* the last enumeration was a match, don't skip around */
+        actualEnum.next(); // return value is not checked; exhaustion shows up as a null term() below
+      } else { /* seek to the next possible string */
+        String nextPoint = nextString(lastTerm.text());
+        if (nextPoint == null) { /* no more possible strings can match */
+          currentTerm = null;
+          return false;
+        }
+        /* replace the old enumerator with a new one, positioned to a nice place */
+        actualEnum.close();
+        actualEnum = reader.terms(new Term(field, nextPoint));
+      }
+
+      Term candidateTerm = actualEnum.term(); /* read a term */
+
+      /* this means end of enumeration: no more terms for this field or no more terms at all */
+      if (candidateTerm == null || candidateTerm.field() != field) { // identity comparison, same as in termCompare()
+        currentTerm = null;
+        return false;
+      }
+
+      /* if the term matches the automaton, success! */
+      if (termCompare(candidateTerm)) { // on failure lastTerm now differs from currentTerm, so the next pass seeks
+        currentTerm = candidateTerm;
+        return true;
+      }
+    } while (true);
+
+  }
+
+  /** this is a dummy, it is not used by this class: the overridden next() never consults it. */
+  //@Override
+  protected boolean endEnum() {
+    assert false; // should never be called
+    return (currentTerm != null);
+  }
+
+
+  /**
+   * Returns the next String in lexicographic order after s that will not put the machine into a reject state.
+   * If such a string does not exist, returns null.
+   * <p>
+   * s is fed to the automaton one character at a time until a step is rejected or s is used up.
+   * The three-argument overload then looks for a replacement character at that position; if it
+   * finds none, the character before that position is incremented, the rest of s is dropped,
+   * and the result is returned when accepted or used as the input of a recursive call.
+   * </p>
+   * @param s input String
+   * @return next valid String
+   */
+  private final String nextString(String s) {
+    int state = runAutomaton.getInitialState();
+    int pos = 0;
+
+    for (pos = 0; pos < s.length(); pos++) {
+      int nextState = runAutomaton.step(state, s.charAt(pos));
+      if (nextState == -1) // -1 is treated as "no transition"; pos stays on the offending character
+        break;
+      else
+        state = nextState;
+    }
+
+    String nextString = nextString(s, state, pos);
+    if (nextString != null) {
+      return nextString;
+    } else { /* no more solutions exist from this useful portion, backtrack */
+      if (pos == 0) /* all solutions exhausted */
+        return null;
+      char nextChar = s.charAt(pos - 1);
+      nextChar++; // NOTE(review): a char of 0xffff would wrap around to 0 here - confirm such terms cannot occur
+      String sprime = s.substring(0, pos - 1) + nextChar;
+      if (runAutomaton.run(sprime)) /* if this is accepted it is good to go */
+        return sprime;
+      else
+        return nextString(sprime);
+    }
+  }
+
+  /**
+   * Returns the next String in lexicographic order after s that will not put the machine into a reject state.
+   * Appends a character to the useful portion. If this cannot satisfy the machine, returns null.
+   * <p>
+   * Candidate characters are tried in increasing order, starting at <code>s.charAt(useful)</code>
+   * (or at 0 when useful is the length of s) and stopping before 0xffff, which is never tried.
+   * </p>
+   *
+   * @param s input String
+   * @param state current non-reject state
+   * @param useful length of the leading portion of s that is kept
+   * @return next valid String
+   */
+  private final String nextString(String s, int state, int useful) {
+    char c = 0;
+    boolean found = false;
+    if (useful < s.length())
+      c = s.charAt(useful);
+
+    for (; c < 0xffff; c++)
+      if (runAutomaton.step(state, c) != -1) {
+        found = true;
+        break;
+      }
+
+    if (!found)
+      return null;
+
+    return s.substring(0, useful) + c;
+  }
+
+}
Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java
===================================================================
--- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java (revision 0)
+++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java (revision 0)
@@ -0,0 +1,99 @@
+package org.apache.lucene.search.regex;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.WildcardQuery;
+
+import dk.brics.automaton.Automaton;
+import dk.brics.automaton.BasicAutomata;
+import dk.brics.automaton.BasicOperations;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + *

+ * A Query that accepts the syntax of {@link WildcardQuery} but implements with an Automaton. + * This accelerates more wildcard cases. + * Most cases are accelerated such as ?(a|b)cd?e, but leading * is still slow. + *

+ */
+public class AutomatonWildcardQuery extends AutomatonQuery {
+  private final boolean termContainsWildcard;
+
+  /**
+   * Builds the query from a term written in Lucene wildcard syntax.
+   * Also records whether the text contains '*' or '?' at all.
+   * @param term query Term
+   */
+  public AutomatonWildcardQuery(Term term) {
+    super(term, toAutomaton(term));
+    final String text = term.text();
+    termContainsWildcard = text.indexOf('*') >= 0 || text.indexOf('?') >= 0;
+  }
+
+  /**
+   * A term without any wildcard is rewritten to a TermQuery (wrapped in a
+   * ConstantScoreQuery when constant score rewrite is enabled) carrying this
+   * query's boost; everything else is left to the superclass.
+   *
+   * @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader)
+   */
+  public Query rewrite(IndexReader reader) throws IOException {
+    if (termContainsWildcard) {
+      return super.rewrite(reader);
+    }
+
+    Query plain = new TermQuery(getTerm());
+    if (getConstantScoreRewrite()) {
+      plain = new ConstantScoreQuery(new QueryWrapperFilter(plain));
+    }
+    plain.setBoost(getBoost());
+    return plain;
+  }
+
+  /**
+   * Translates Lucene wildcard syntax into an automaton: '*' becomes "any string",
+   * '?' becomes "any single character", every other character stands for itself,
+   * and the pieces are concatenated in order.
+   */
+  private static Automaton toAutomaton(Term wildcardquery) {
+    final String pattern = wildcardquery.text();
+    final List pieces = new ArrayList();
+
+    for (int idx = 0; idx < pattern.length(); idx++) {
+      final char ch = pattern.charAt(idx);
+      final Automaton piece;
+      if (ch == '*') {
+        piece = BasicAutomata.makeAnyString();
+      } else if (ch == '?') {
+        piece = BasicAutomata.makeAnyChar();
+      } else {
+        piece = BasicAutomata.makeChar(ch);
+      }
+      pieces.add(piece);
+    }
+
+    return BasicOperations.concatenate(pieces);
+  }
+
+}
Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java
===================================================================
--- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0)
+++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0)
@@ -0,0 +1,88 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+
+/**
+ * Exercises AutomatonQuery against a one-document in-memory index.
+ */
+public class TestAutomatonQuery extends TestCase {
+  private IndexSearcher searcher;
+  private static final String FN = "field";
+
+  /**
+   * Indexes a single document and opens a searcher on it.
+   * Failures propagate as exceptions so that the test report keeps the stack trace.
+   */
+  public void setUp() throws Exception {
+    super.setUp();
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true,
+                                         IndexWriter.MaxFieldLength.LIMITED);
+    try {
+      Document doc = new Document();
+      doc.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+      writer.optimize();
+    } finally {
+      writer.close(); // release the writer even when indexing fails
+    }
+    searcher = new IndexSearcher(directory);
+  }
+
+  /** Closes the searcher opened by setUp(). */
+  public void tearDown() throws Exception {
+    searcher.close();
+    super.tearDown();
+  }
+
+  private Term newTerm(String value) { return new Term(FN, value); }
+
+  /**
+   * Runs the regular expression as an AutomatonQuery and returns the number of matching documents.
+   * Uses the TopDocs search API, like the other tests in this package, instead of the deprecated Hits.
+   */
+  private int regexQueryNrHits(String regex) throws Exception {
+    AutomatonQuery query = new AutomatonQuery( newTerm(regex));
+    return searcher.search(query, null, 1000).totalHits;
+  }
+
+  /** "quick" is matched as a whole term. */
+  public void testRegex1() throws Exception {
+    assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
+  }
+
+  /** ^ is implied: the expression must match from the first character of the term. */
+  public void testRegex2() throws Exception {
+    assertEquals(0, regexQueryNrHits(".[aeiou]c.*"));
+  }
+
+  /** $ is implied: the expression must match through the last character of the term. */
+  public void testRegex3() throws Exception {
+    assertEquals(0, regexQueryNrHits("q.[aeiou]c"));
+  }
+
+  /** A RegexQuery and an AutomatonQuery over the same term are not equal. */
+  public void testEquals() throws Exception {
+    RegexQuery query1 = new RegexQuery( newTerm("foo.*"));
+    query1.setRegexImplementation(new JakartaRegexpCapabilities());
+
+    AutomatonQuery query2 = new AutomatonQuery( newTerm("foo.*"));
+    assertFalse(query1.equals(query2));
+  }
+
+}
+
Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcardQuery.java
===================================================================
--- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcardQuery.java (revision 0)
+++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcardQuery.java (revision 0)
@@ -0,0 +1,272 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.io.IOException;
+
+/**
+ * Tests the '*' and '?' wildcard characters as handled by AutomatonWildcardQuery.
+ *
+ * Adapted from TestWildcard (TestWildcard.java 694004 2008-09-10 21:38:52Z mikemccand).
+ *
+ */
+public class TestAutomatonWildcardQuery
+    extends LuceneTestCase {
+  /** Queries built from the same term are equal to each other and unequal to null and to other query types. */
+  public void testEquals() {
+    AutomatonWildcardQuery wq1 = new AutomatonWildcardQuery(new Term("field", "b*a"));
+    AutomatonWildcardQuery wq2 = new AutomatonWildcardQuery(new Term("field", "b*a"));
+    AutomatonWildcardQuery wq3 = new AutomatonWildcardQuery(new Term("field", "b*a"));
+
+    // symmetric?
+    assertEquals(wq1, wq2);
+    assertEquals(wq2, wq1);
+
+    // transitive?
+    assertEquals(wq2, wq3);
+    assertEquals(wq1, wq3);
+
+    assertFalse(wq1.equals(null));
+
+    FuzzyQuery fq = new FuzzyQuery(new Term("field", "b*a"));
+    assertFalse(wq1.equals(fq));
+    assertFalse(fq.equals(wq1));
+  }
+
+  /**
+   * Tests if a AutomatonWildcardQuery that has no wildcard in the term is rewritten to a single
+   * TermQuery.
+   */
+  public void testTermWithoutWildcard() throws IOException {
+    RAMDirectory indexStore = getIndexStore("field", new String[]{"nowildcard", "nowildcardx"});
+    IndexSearcher searcher = new IndexSearcher(indexStore);
+
+    Query wq = new AutomatonWildcardQuery(new Term("field", "nowildcard"));
+    assertMatches(searcher, wq, 1);
+
+    wq = searcher.rewrite(wq);
+    assertTrue(wq instanceof TermQuery);
+
+    searcher.close();
+  }
+
+  /**
+   * Tests Wildcard queries with an asterisk.
+   */
+  public void testAsterisk()
+      throws IOException {
+    RAMDirectory indexStore = getIndexStore("body", new String[]
+    {"metal", "metals"});
+    IndexSearcher searcher = new IndexSearcher(indexStore);
+    Query query1 = new TermQuery(new Term("body", "metal"));
+    Query query2 = new AutomatonWildcardQuery(new Term("body", "metal*"));
+    Query query3 = new AutomatonWildcardQuery(new Term("body", "m*tal"));
+    Query query4 = new AutomatonWildcardQuery(new Term("body", "m*tal*"));
+    Query query5 = new AutomatonWildcardQuery(new Term("body", "m*tals"));
+
+    BooleanQuery query6 = new BooleanQuery();
+    query6.add(query5, BooleanClause.Occur.SHOULD);
+
+    BooleanQuery query7 = new BooleanQuery();
+    query7.add(query3, BooleanClause.Occur.SHOULD);
+    query7.add(query5, BooleanClause.Occur.SHOULD);
+
+    // Queries do not automatically lower-case search terms:
+    Query query8 = new AutomatonWildcardQuery(new Term("body", "M*tal*"));
+
+    assertMatches(searcher, query1, 1);
+    assertMatches(searcher, query2, 2);
+    assertMatches(searcher, query3, 1);
+    assertMatches(searcher, query4, 2);
+    assertMatches(searcher, query5, 1);
+    assertMatches(searcher, query6, 1);
+    assertMatches(searcher, query7, 2);
+    assertMatches(searcher, query8, 0);
+    assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tall")), 0);
+    assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tal")), 1);
+    assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tal*")), 2);
+
+    searcher.close();
+  }
+
+  /**
+   * Tests Wildcard queries with a question mark.
+   *
+   * @throws IOException if an error occurs
+   */
+  public void testQuestionmark()
+      throws IOException {
+    RAMDirectory indexStore = getIndexStore("body", new String[]
+    {"metal", "metals", "mXtals", "mXtXls"});
+    IndexSearcher searcher = new IndexSearcher(indexStore);
+    Query query1 = new AutomatonWildcardQuery(new Term("body", "m?tal"));
+    Query query2 = new AutomatonWildcardQuery(new Term("body", "metal?"));
+    Query query3 = new AutomatonWildcardQuery(new Term("body", "metals?"));
+    Query query4 = new AutomatonWildcardQuery(new Term("body", "m?t?ls"));
+    Query query5 = new AutomatonWildcardQuery(new Term("body", "M?t?ls"));
+    Query query6 = new AutomatonWildcardQuery(new Term("body", "meta??"));
+
+    assertMatches(searcher, query1, 1);
+    assertMatches(searcher, query2, 1);
+    assertMatches(searcher, query3, 0);
+    assertMatches(searcher, query4, 3);
+    assertMatches(searcher, query5, 0);
+    assertMatches(searcher, query6, 1); // Query: 'meta??' matches 'metals' not 'metal'
+
+    searcher.close();
+  }
+
+  /**
+   * Builds an in-memory index holding one document per entry of contents,
+   * each stored and analyzed (SimpleAnalyzer) in the given field.
+   */
+  private RAMDirectory getIndexStore(String field, String[] contents)
+      throws IOException {
+    RAMDirectory indexStore = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+    for (int i = 0; i < contents.length; ++i) {
+      Document doc = new Document();
+      doc.add(new Field(field, contents[i], Field.Store.YES, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+    writer.optimize();
+    writer.close();
+
+    return indexStore;
+  }
+
+  /** Asserts that q returns exactly expectedMatches documents (looking at the top 1000 hits). */
+  private void assertMatches(IndexSearcher searcher, Query q, int expectedMatches)
+      throws IOException {
+    ScoreDoc[] result = searcher.search(q, null, 1000).scoreDocs;
+    assertEquals(expectedMatches, result.length);
+  }
+
+  /**
+   * Test that wild card queries are parsed to the correct type and are searched correctly.
+   * This test looks at both parsing and execution of wildcard queries.
+   * Although placed here, it also tests prefix queries, verifying that
+   * prefix queries are not parsed into wild card queries, and viceversa.
+   * @throws Exception
+   */
+  public void testParsingAndSearching() throws Exception {
+    String field = "content";
+    boolean dbg = false;
+    QueryParser qp = new QueryParser(field, new WhitespaceAnalyzer()) {
+
+      @Override
+      protected Query newWildcardQuery(Term t) {
+        return new AutomatonWildcardQuery(t);
+      }
+
+    };
+    qp.setAllowLeadingWildcard(true);
+    String docs[] = {
+        "\\ abcdefg1",
+        "\\79 hijklmn1",
+        "\\\\ opqrstu1",
+    };
+    // queries that should find all docs
+    String matchAll[] = {
+        "*", "*1", "**1", "*?", "*?1", "?*1", "**", "***", "\\\\*"
+    };
+    // queries that should find no docs
+    String matchNone[] = {
+        "a*h", "a?h", "*a*h", "?a", "a?",
+    };
+    // queries that should be parsed to prefix queries
+    String matchOneDocPrefix[][] = {
+        {"a*", "ab*", "abc*", }, // these should find only doc 0
+        {"h*", "hi*", "hij*", "\\\\7*"}, // these should find only doc 1
+        {"o*", "op*", "opq*", "\\\\\\\\*"}, // these should find only doc 2
+    };
+    // queries that should be parsed to wildcard queries
+    String matchOneDocWild[][] = {
+        {"*a*", "*ab*", "*abc**", "ab*e*", "*g?", "*f?1", "abc**"}, // these should find only doc 0
+        {"*h*", "*hi*", "*hij**", "hi*k*", "*n?", "*m?1", "hij**"}, // these should find only doc 1
+        {"*o*", "*op*", "*opq**", "op*q*", "*u?", "*t?1", "opq**"}, // these should find only doc 2
+    };
+
+    // prepare the index
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter iw = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    for (int i = 0; i < docs.length; i++) {
+      Document doc = new Document();
+      doc.add(new Field(field,docs[i],Store.NO,Index.ANALYZED));
+      iw.addDocument(doc);
+    }
+    iw.close();
+
+    IndexSearcher searcher = new IndexSearcher(dir);
+
+    // test queries that must find all
+    for (int i = 0; i < matchAll.length; i++) {
+      String qtxt = matchAll[i];
+      Query q = qp.parse(qtxt);
+      if (dbg) System.out.println("matchAll: qtxt="+qtxt+" q="+q+" "+q.getClass().getName());
+      ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
+      assertEquals(docs.length,hits.length);
+    }
+
+    // test queries that must find none
+    for (int i = 0; i < matchNone.length; i++) {
+      String qtxt = matchNone[i];
+      Query q = qp.parse(qtxt);
+      if (dbg) System.out.println("matchNone: qtxt="+qtxt+" q="+q+" "+q.getClass().getName());
+      ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
+      assertEquals(0,hits.length);
+    }
+
+    // test queries that must be prefix queries and must find only one doc
+    for (int i = 0; i < matchOneDocPrefix.length; i++) {
+      for (int j = 0; j < matchOneDocPrefix[i].length; j++) {
+        String qtxt = matchOneDocPrefix[i][j];
+        Query q = qp.parse(qtxt);
+        if (dbg) System.out.println("match 1 prefix: doc="+docs[i]+" qtxt="+qtxt+" q="+q+" "+q.getClass().getName());
+        assertEquals(PrefixQuery.class, q.getClass());
+        ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
+        assertEquals(1,hits.length);
+        assertEquals(i,hits[0].doc);
+      }
+    }
+
+    // test queries that must be wildcard queries and must find only one doc
+    // (the outer bound is the wildcard table itself, not the prefix table)
+    for (int i = 0; i < matchOneDocWild.length; i++) {
+      for (int j = 0; j < matchOneDocWild[i].length; j++) {
+        String qtxt = matchOneDocWild[i][j];
+        Query q = qp.parse(qtxt);
+        if (dbg) System.out.println("match 1 wild: doc="+docs[i]+" qtxt="+qtxt+" q="+q+" "+q.getClass().getName());
+        assertEquals(AutomatonWildcardQuery.class, q.getClass());
+        ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
+        assertEquals(1,hits.length);
+        assertEquals(i,hits[0].doc);
+      }
+    }
+
+    searcher.close();
+  }
+
+}