Index: contrib/regex/build.xml
===================================================================
--- contrib/regex/build.xml (revision 725886)
+++ contrib/regex/build.xml (working copy)
@@ -24,7 +24,7 @@
+ * A RegexpFilter that utilizes the BRICS automaton package: http://www.brics.dk/automaton/
+ *
+ * The expression is converted to a DFA, and the state machine is used to optimize term enumeration.
+ *
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied.
+ *
+ * Construct a new AutomatonFilter.
+ *
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *
+ * @param term Term containing field and regular expression + */ + public AutomatonFilter(Term term) { + this(term, new RegExp(term.text()).toAutomaton()); + } + + /** + *+ * Construct a new Automaton Filter + *
+ * + * @param field Lucene field to query + * @param automaton Brics Automaton + */ + public AutomatonFilter(String field, Automaton automaton) { + this(new Term(field), automaton); + } + + /** + *+ * Construct a new Automaton Filter + *
+ * + * @param term Term for the Lucene field to query + * @param automaton Brics automaton + */ + protected AutomatonFilter(Term term, Automaton automaton) { + super(); + this.term = term; + this.automaton = automaton; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader) + */ + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + /* + * The algorithm here is pretty basic. Enumerate terms but instead of a binary accept/reject do: + * + * Look at the portion that is OK (did not enter a reject state in the DFA) + * Generate the next possible String and seek to that. + * + * Because this implementation is bounded by alphabet size, it could be slightly improved. + * One possibility is the use of Automaton State/Transition classes which provide character intervals. + * For large numbers of CJK terms where the "alphabet" is large, this might optimize things a bit better. + * + */ + OpenBitSet bits = new OpenBitSet(reader.maxDoc()); + String prefix = automaton.getCommonPrefix(); + RunAutomaton runAutomaton = new RunAutomaton(automaton); + + /* if there is a static prefix, why not start here, but probably not truly necessary */ + TermEnum enumerator = reader.terms(term.createTerm(prefix)); + TermDocs termDocs = reader.termDocs(); + + Term t = null; + while ((t = enumerator.term()) != null) { + + if (t.field() != term.field()) { /* wrong field, enumerated all the terms we need */ + break; + } + + String termText = t.text(); + + if (!termText.startsWith(prefix)) { /* we are done. this helps for a large alphabet */ + break; + } + + /* run the string against the automaton. + * Either the string is accepted, or it is rejected. + * When rejected, the acceptStatus contains the highest index that DID NOT go into a reject state. 
+ */ + int acceptStatus = run(runAutomaton, termText); + if (acceptStatus == ACCEPTED) { + /* in this case, add all the docs and keep enumerating + */ + termDocs.seek(t); + while (termDocs.next()) + bits.set(termDocs.doc()); + enumerator.next(); + } else { + /* in this case, want to take the portion that wasn't rejected, and generate the next possible unicode string. + * instruct TermEnum to seek to that location. + */ + enumerator.close(); + Term next = term.createTerm(nextString(termText, acceptStatus)); + enumerator = reader.terms(next); + } + } + enumerator.close(); + termDocs.close(); + return bits; + } + + /** + * Helper function to generate the next possible Unicode String + * @param termText String value of term + * @param acceptStatus max character position that did not enter into a reject state + * @return next possible unicode String + */ + + private static final String nextString(String termText, int acceptStatus) { + if (termText.length() == 0) /* empty string */ + return "\u0000"; + + int boundary = acceptStatus + 1; + + StringBuffer prefix = new StringBuffer(); + prefix.append(termText.substring(0, boundary)); + int nextChar = -1; + if (boundary < termText.length()) + nextChar = termText.charAt(boundary); + /* U+FFFF is guaranteed not to ever be a valid unicode character so no overflow risk here */ + prefix.append((char)(nextChar + 1)); + + return prefix.toString(); + } + + + /** + *+ * Return the character position of the longest portion that doesn't enter a reject state. + * This method returns one of three values + * ACCEPTED (-2): This means this is a match, it ends in an accept state. + * -1: the first character entered a reject state, therefore no characters are "useful" + * n: where n is the position of the longest portion that did not enter a reject state. + *
+ * @param ra RunAutomaton + * @param s String + * @return index of the longest portion that doesn't enter reject state, or ACCEPTED, or -1 + */ + private static final int run(RunAutomaton ra, String s) { + int state = ra.getInitialState(); + int length = s.length(); + int max = -1; + for (int offset = 0; offset < length; offset++) { + state = ra.step(state, s.charAt(offset)); + if (state == -1) + break; + else + max++; + } + + if (state >= 0 && ra.isAccept(state)) + return ACCEPTED; + + return max; + } + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((automaton == null) ? 0 : automaton.hashCode()); + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AutomatonFilter other = (AutomatonFilter) obj; + if (automaton == null) { + if (other.automaton != null) + return false; + } else if (!automaton.equals(other.automaton)) + return false; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } + +} Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0) @@ -0,0 +1,115 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.ToStringUtils; + +/** + *+ * A RegexpQuery that utilizes the BRICS automaton package: http://www.brics.dk/automaton/ + *
+ * + *+ * Queries are converted to a DFA, and the state machine is used to optimize term enumeration. + *
+ * Score is constant and equal to the boost. + * + * + *+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *
+ * + */ +public class AutomatonQuery extends Query { + protected Term term; + + /** + *+ * Construct a new AutomatonQuery. + *
+ * Term is expected to contain regex syntax compatible with the BRICS package: + * http://www.brics.dk/automaton/ + * + *+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *
+ * @param term Term containing field and regular expression + */ + public AutomatonQuery(Term term) { + this.term = term; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) + */ + public Query rewrite(IndexReader reader) throws IOException { + Query query = new ConstantScoreQuery(new AutomatonFilter(term)); + query.setBoost(getBoost()); + return query; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#toString(java.lang.String) + */ + public String toString(String field) { + StringBuffer buffer = new StringBuffer(); + buffer.append("automatonQuery("); + buffer.append(term); + buffer.append(")"); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } + + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AutomatonQuery other = (AutomatonQuery) obj; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } + +} Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java =================================================================== --- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java (revision 0) +++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonWildcardQuery.java (revision 0) @@ -0,0 +1,133 @@ +package org.apache.lucene.search.regex; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import 
org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.ToStringUtils; + +import dk.brics.automaton.Automaton; +import dk.brics.automaton.BasicAutomata; +import dk.brics.automaton.BasicOperations; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + *+ * A Query that accepts the syntax of {@link WildcardQuery} but implements with an Automaton. + * This accelerates more wildcard cases. + * Most cases are accelerated such as ?(a|b)cd?e, but leading * is still slow. + *
+ *+ * Score is constant and equal to the boost + *
+ */ +public class AutomatonWildcardQuery extends Query { + protected Term term; + private boolean termContainsWildcard; + + /** + * Construct a new AutomatonWildcardQuery + * @param term query Term + */ + public AutomatonWildcardQuery(Term term) { + this.term = term; + this.termContainsWildcard = (term.text().indexOf('*') != -1) || (term.text().indexOf('?') != -1); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) + */ + public Query rewrite(IndexReader reader) throws IOException { + if (!termContainsWildcard) + return new TermQuery(term); + + List automata = new ArrayList(); + + String wildcardText = term.text(); + + for (int i = 0; i < wildcardText.length(); i++) { + char c = wildcardText.charAt(i); + switch(c) { + case '*': { + automata.add(BasicAutomata.makeAnyString()); + break; + } + case '?': { + automata.add(BasicAutomata.makeAnyChar()); + break; + } + default: { + automata.add(BasicAutomata.makeChar(c)); + } + } + } + + Automaton combined = BasicOperations.concatenate(automata); + Query query = new ConstantScoreQuery(new AutomatonFilter(term.field(), combined)); + query.setBoost(getBoost()); + return query; + } + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((term == null) ? 
0 : term.hashCode()); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AutomatonWildcardQuery other = (AutomatonWildcardQuery) obj; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#toString(java.lang.String) + */ + public String toString(String field) { + StringBuffer buffer = new StringBuffer(); + buffer.append("automatonWildcardQuery("); + buffer.append(term); + buffer.append(")"); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } +} Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java =================================================================== --- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0) +++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0) @@ -0,0 +1,87 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.RAMDirectory; + +public class TestAutomatonQuery extends TestCase { + private IndexSearcher searcher; + private final String FN = "field"; + + public void setUp() { + RAMDirectory directory = new RAMDirectory(); + try { + IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory); + } catch (Exception e) { + fail(e.toString()); + } + } + + public void tearDown() { + try { + searcher.close(); + } catch (Exception e) { + fail(e.toString()); + } + } + + private Term newTerm(String value) { return new Term(FN, value); } + + private int regexQueryNrHits(String regex) throws Exception { + AutomatonQuery query = new AutomatonQuery( newTerm(regex)); + return searcher.search(query).length(); + } + + + public void testRegex1() throws Exception { + assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); + } + + public void testRegex2() throws Exception { + assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); + } + + public void testRegex3() throws Exception { + assertEquals(0, regexQueryNrHits("q.[aeiou]c")); + } + + + /* NOTE(review): originally constructed RegexQuery here (copy-paste from TestRegexQuery), + * so AutomatonQuery.equals() was never exercised; fixed to test AutomatonQuery. */ + public void testEquals() throws Exception { + AutomatonQuery query1 = new AutomatonQuery( newTerm("foo.*")); + + AutomatonQuery query2 = new AutomatonQuery( newTerm("foo.*")); + assertEquals(query1.equals(query2), true); + } + +} + Index: 
contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcard.java =================================================================== --- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcard.java (revision 0) +++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonWildcard.java (revision 0) @@ -0,0 +1,272 @@ +package org.apache.lucene.search.regex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * TestAutomatonWildcard tests the '*' and '?' wildcard characters. + * + * shamelessly ripped from TestWildcard + * + */ +public class TestAutomatonWildcard + extends LuceneTestCase { + public void testEquals() { + AutomatonWildcardQuery wq1 = new AutomatonWildcardQuery(new Term("field", "b*a")); + AutomatonWildcardQuery wq2 = new AutomatonWildcardQuery(new Term("field", "b*a")); + AutomatonWildcardQuery wq3 = new AutomatonWildcardQuery(new Term("field", "b*a")); + + // reflexive? + assertEquals(wq1, wq2); + assertEquals(wq2, wq1); + + // transitive? + assertEquals(wq2, wq3); + assertEquals(wq1, wq3); + + assertFalse(wq1.equals(null)); + + FuzzyQuery fq = new FuzzyQuery(new Term("field", "b*a")); + assertFalse(wq1.equals(fq)); + assertFalse(fq.equals(wq1)); + } + + /** + * Tests if a WildcardQuery that has no wildcard in the term is rewritten to a single + * TermQuery. 
+ */ + public void testTermWithoutWildcard() throws IOException { + RAMDirectory indexStore = getIndexStore("field", new String[]{"nowildcard", "nowildcardx"}); + IndexSearcher searcher = new IndexSearcher(indexStore); + + Query wq = new AutomatonWildcardQuery(new Term("field", "nowildcard")); + assertMatches(searcher, wq, 1); + + wq = searcher.rewrite(wq); + assertTrue(wq instanceof TermQuery); + } + + /** + * Tests Wildcard queries with an asterisk. + */ + public void testAsterisk() + throws IOException { + RAMDirectory indexStore = getIndexStore("body", new String[] + {"metal", "metals"}); + IndexSearcher searcher = new IndexSearcher(indexStore); + Query query1 = new TermQuery(new Term("body", "metal")); + Query query2 = new AutomatonWildcardQuery(new Term("body", "metal*")); + Query query3 = new AutomatonWildcardQuery(new Term("body", "m*tal")); + Query query4 = new AutomatonWildcardQuery(new Term("body", "m*tal*")); + Query query5 = new AutomatonWildcardQuery(new Term("body", "m*tals")); + + BooleanQuery query6 = new BooleanQuery(); + query6.add(query5, BooleanClause.Occur.SHOULD); + + BooleanQuery query7 = new BooleanQuery(); + query7.add(query3, BooleanClause.Occur.SHOULD); + query7.add(query5, BooleanClause.Occur.SHOULD); + + // Queries do not automatically lower-case search terms: + Query query8 = new AutomatonWildcardQuery(new Term("body", "M*tal*")); + + assertMatches(searcher, query1, 1); + assertMatches(searcher, query2, 2); + assertMatches(searcher, query3, 1); + assertMatches(searcher, query4, 2); + assertMatches(searcher, query5, 1); + assertMatches(searcher, query6, 1); + assertMatches(searcher, query7, 2); + assertMatches(searcher, query8, 0); + assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tall")), 0); + assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tal")), 1); + assertMatches(searcher, new AutomatonWildcardQuery(new Term("body", "*tal*")), 2); + } + + /** + * Tests Wildcard queries with a question 
mark. + * + * @throws IOException if an error occurs + */ + public void testQuestionmark() + throws IOException { + RAMDirectory indexStore = getIndexStore("body", new String[] + {"metal", "metals", "mXtals", "mXtXls"}); + IndexSearcher searcher = new IndexSearcher(indexStore); + Query query1 = new AutomatonWildcardQuery(new Term("body", "m?tal")); + Query query2 = new AutomatonWildcardQuery(new Term("body", "metal?")); + Query query3 = new AutomatonWildcardQuery(new Term("body", "metals?")); + Query query4 = new AutomatonWildcardQuery(new Term("body", "m?t?ls")); + Query query5 = new AutomatonWildcardQuery(new Term("body", "M?t?ls")); + Query query6 = new AutomatonWildcardQuery(new Term("body", "meta??")); + + assertMatches(searcher, query1, 1); + assertMatches(searcher, query2, 1); + assertMatches(searcher, query3, 0); + assertMatches(searcher, query4, 3); + assertMatches(searcher, query5, 0); + assertMatches(searcher, query6, 1); // Query: 'meta??' matches 'metals' not 'metal' + } + + private RAMDirectory getIndexStore(String field, String[] contents) + throws IOException { + RAMDirectory indexStore = new RAMDirectory(); + IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + for (int i = 0; i < contents.length; ++i) { + Document doc = new Document(); + doc.add(new Field(field, contents[i], Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + writer.optimize(); + writer.close(); + + return indexStore; + } + + private void assertMatches(IndexSearcher searcher, Query q, int expectedMatches) + throws IOException { + ScoreDoc[] result = searcher.search(q, null, 1000).scoreDocs; + assertEquals(expectedMatches, result.length); + } + + /** + * Test that wild card queries are parsed to the correct type and are searched correctly. + * This test looks at both parsing and execution of wildcard queries. 
+ * Although placed here, it also tests prefix queries, verifying that + * prefix queries are not parsed into wild card queries, and viceversa. + * @throws Exception + */ + public void testParsingAndSearching() throws Exception { + String field = "content"; + boolean dbg = false; + QueryParser qp = new QueryParser(field, new WhitespaceAnalyzer()) { + + protected Query newWildcardQuery(Term t) { + return new AutomatonWildcardQuery(t); + } + + }; + + qp.setAllowLeadingWildcard(true); + String docs[] = { + "\\ abcdefg1", + "\\79 hijklmn1", + "\\\\ opqrstu1", + }; + // queries that should find all docs + String matchAll[] = { + "*", "*1", "**1", "*?", "*?1", "?*1", "**", "***", "\\\\*" + }; + // queries that should find no docs + String matchNone[] = { + "a*h", "a?h", "*a*h", "?a", "a?", + }; + // queries that should be parsed to prefix queries + String matchOneDocPrefix[][] = { + {"a*", "ab*", "abc*", }, // these should find only doc 0 + {"h*", "hi*", "hij*", "\\\\7*"}, // these should find only doc 1 + {"o*", "op*", "opq*", "\\\\\\\\*"}, // these should find only doc 2 + }; + // queries that should be parsed to wildcard queries + String matchOneDocWild[][] = { + {"*a*", "*ab*", "*abc**", "ab*e*", "*g?", "*f?1", "abc**"}, // these should find only doc 0 + {"*h*", "*hi*", "*hij**", "hi*k*", "*n?", "*m?1", "hij**"}, // these should find only doc 1 + {"*o*", "*op*", "*opq**", "op*q*", "*u?", "*t?1", "opq**"}, // these should find only doc 2 + }; + + // prepare the index + RAMDirectory dir = new RAMDirectory(); + IndexWriter iw = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(new Field(field,docs[i],Store.NO,Index.ANALYZED)); + iw.addDocument(doc); + } + iw.close(); + + IndexSearcher searcher = new IndexSearcher(dir); + + // test queries that must find all + for (int i = 0; i < matchAll.length; i++) { + String qtxt = matchAll[i]; + Query q = 
qp.parse(qtxt); + if (dbg) System.out.println("matchAll: qtxt="+qtxt+" q="+q+" "+q.getClass().getName()); + ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; + assertEquals(docs.length,hits.length); + } + + // test queries that must find none + for (int i = 0; i < matchNone.length; i++) { + String qtxt = matchNone[i]; + Query q = qp.parse(qtxt); + if (dbg) System.out.println("matchNone: qtxt="+qtxt+" q="+q+" "+q.getClass().getName()); + ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; + assertEquals(0,hits.length); + } + + // test queries that must be prefix queries and must find only one doc + for (int i = 0; i < matchOneDocPrefix.length; i++) { + for (int j = 0; j < matchOneDocPrefix[i].length; j++) { + String qtxt = matchOneDocPrefix[i][j]; + Query q = qp.parse(qtxt); + if (dbg) System.out.println("match 1 prefix: doc="+docs[i]+" qtxt="+qtxt+" q="+q+" "+q.getClass().getName()); + assertEquals(PrefixQuery.class, q.getClass()); + ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; + assertEquals(1,hits.length); + assertEquals(i,hits[0].doc); + } + } + + // test queries that must be wildcard queries and must find only one doc + for (int i = 0; i < matchOneDocPrefix.length; i++) { + for (int j = 0; j < matchOneDocWild[i].length; j++) { + String qtxt = matchOneDocWild[i][j]; + Query q = qp.parse(qtxt); + if (dbg) System.out.println("match 1 wild: doc="+docs[i]+" qtxt="+qtxt+" q="+q+" "+q.getClass().getName()); + assertEquals(AutomatonWildcardQuery.class, q.getClass()); + ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; + assertEquals(1,hits.length); + assertEquals(i,hits[0].doc); + } + } + + searcher.close(); + } + +}