Index: contrib/regex/build.xml
===================================================================
--- contrib/regex/build.xml (revision 725886)
+++ contrib/regex/build.xml (working copy)
@@ -24,7 +24,7 @@
+ * A RegexpFilter that utilizes the BRICS automaton package: http://www.brics.dk/automaton/
+ *
+ * The expression is converted to a DFA, and the state machine is used to optimize term enumeration.
+ *
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied.
+ *
+ * Construct a new AutomatonFilter.
+ *
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied. + *
+   * @param term Term containing field and regular expression
+   */
+  public AutomatonFilter(Term term) {
+    super();
+    this.term = term;
+  }
+
+  /**
+   * Builds the set of documents that contain at least one term, in this filter's field,
+   * accepted by the automaton compiled from the term text.
+   *
+   * @param reader index whose terms and postings are scanned
+   * @return bit set, sized to reader.maxDoc(), with one bit set per matching document
+   * @throws IOException if an index call fails; the term enumerator and the postings
+   *         cursor are closed before the exception propagates
+   * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
+   */
+  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+    /*
+     * The algorithm here is pretty basic. Enumerate terms but instead of a binary accept/reject do:
+     *
+     * Look at the portion that is OK (did not enter a reject state in the DFA)
+     * Generate the next possible String and seek to that.
+     *
+     * Because this implementation is bounded by alphabet size, it could be slightly improved.
+     * One possibility is the use of Automaton State/Transition classes which provide character intervals.
+     * For large numbers of CJK terms where the "alphabet" is large, this might optimize things a bit better.
+     *
+     */
+    OpenBitSet bits = new OpenBitSet(reader.maxDoc());
+    Automaton automaton = new RegExp(term.text()).toAutomaton();
+    String prefix = automaton.getCommonPrefix();
+    RunAutomaton runAutomaton = new RunAutomaton(automaton);
+
+    TermEnum enumerator = null;
+    TermDocs termDocs = null;
+    try {
+      /* if there is a static prefix, why not start here, but probably not truly necessary */
+      enumerator = reader.terms(term.createTerm(prefix));
+      termDocs = reader.termDocs();
+
+      Term t = null;
+      while ((t = enumerator.term()) != null) {
+
+        /* identity comparison of field names, as in the original code; this presumably relies
+         * on Term handing out interned field strings -- confirm against the Term Javadoc */
+        if (t.field() != term.field()) { /* wrong field, enumerated all the terms we need */
+          break;
+        }
+
+        String termText = t.text();
+
+        /* run the string against the automaton.
+         * Either the string is accepted, or it is rejected.
+         * When rejected, the acceptStatus contains the highest index that DID NOT go into a reject state.
+         */
+        int acceptStatus = run(runAutomaton, termText);
+        if (acceptStatus == ACCEPTED) {
+          /* in this case, add all the docs and keep enumerating
+           */
+          termDocs.seek(t);
+          while (termDocs.next()) {
+            bits.set(termDocs.doc());
+          }
+          enumerator.next();
+        } else {
+          /* in this case, want to take the portion that wasn't rejected, and generate the next possible unicode string.
+           * instruct TermEnum to seek to that location.
+           */
+          Term next = term.createTerm(nextString(termText, acceptStatus));
+          TermEnum exhausted = enumerator;
+          enumerator = null; /* keeps the finally block from closing the old enumerator twice if re-opening fails */
+          exhausted.close();
+          enumerator = reader.terms(next);
+        }
+      }
+    } finally {
+      /* release both index resources even when enumeration fails part-way through */
+      try {
+        if (enumerator != null) {
+          enumerator.close();
+        }
+      } finally {
+        if (termDocs != null) {
+          termDocs.close();
+        }
+      }
+    }
+    return bits;
+  }
+
+  /**
+   * Helper function to generate the next possible Unicode String: the accepted portion of the
+   * term followed by the successor of the character that was rejected. When no character was
+   * rejected (the whole term was consumed without reaching an accept state) U+0000 is appended.
+   *
+   * @param termText String value of term
+   * @param acceptStatus max character position that did not enter into a reject state
+   * @return next possible unicode String
+   */
+
+  private static final String nextString(String termText, int acceptStatus) {
+    if (termText.length() == 0) /* empty string */
+      return "\u0000";
+
+    /* number of leading characters that stayed out of the reject state */
+    int boundary = acceptStatus + 1;
+
+    StringBuffer prefix = new StringBuffer();
+    prefix.append(termText.substring(0, boundary));
+    /* -1 makes the increment below yield U+0000 when there is no rejected character */
+    int nextChar = -1;
+    if (boundary < termText.length())
+      nextChar = termText.charAt(boundary);
+    /* U+FFFF is guaranteed not to ever be a valid unicode character so no overflow risk here.
+     * NOTE(review): a rejected U+FFFF would wrap to U+0000 in the cast below; this code relies
+     * on such a character never occurring in term text. */
+    prefix.append((char)(nextChar + 1));
+
+    return prefix.toString();
+  }
+
+
+  /**
+   *
+   * Return the character position of the longest portion that doesn't enter a reject state.
+   * This method returns one of three values
+   * ACCEPTED (-2): This means this is a match, it ends in an accept state.
+   * -1: the first character entered a reject state, therefore no characters are "useful"
+   * n: where n is the position of the longest portion that did not enter a reject state.
+   *
+   * @param ra RunAutomaton
+   * @param s String
+   * @return index of the longest portion that doesn't enter reject state, or ACCEPTED, or -1
+   */
+  private static final int run(RunAutomaton ra, String s) {
+    int state = ra.getInitialState();
+    int length = s.length();
+    int max = -1;
+    for (int offset = 0; offset < length; offset++) {
+      state = ra.step(state, s.charAt(offset));
+      if (state == -1) /* -1 is treated as the reject state */
+        break;
+      else
+        max++;
+    }
+
+    if (state >= 0 && ra.isAccept(state))
+      return ACCEPTED;
+
+    return max;
+  }
+
+  /* (non-Javadoc)
+   * @see java.lang.Object#hashCode()
+   */
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((term == null) ? 0 : term.hashCode());
+    return result;
+  }
+
+  /* (non-Javadoc)
+   * @see java.lang.Object#equals(java.lang.Object)
+   */
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    final AutomatonFilter other = (AutomatonFilter) obj;
+    if (term == null) {
+      if (other.term != null)
+        return false;
+    } else if (!term.equals(other.term))
+      return false;
+    return true;
+  }
+
+}
Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java
===================================================================
--- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0)
+++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0)
@@ -0,0 +1,115 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.ToStringUtils;
+
+/**
+ *
+ * A RegexpQuery that utilizes the BRICS automaton package: http://www.brics.dk/automaton/
+ *
+ *
+ *
+ * Queries are converted to a DFA, and the state machine is used to optimize term enumeration.
+ *
+ * Score is constant and equal to the boost.
+ *
+ *
+ *
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied.
+ *
+ *
+ */
+public class AutomatonQuery extends Query {
+  protected Term term;
+
+  /**
+   *
+   * Construct a new AutomatonQuery.
+   *
+   * Term is expected to contain regex syntax compatible with the BRICS package:
+   * http://www.brics.dk/automaton/
+   *
+   *
+   * Please note that this regex syntax is a bit different from others in that ^ and $ are implied.
+   *
+   * @param term Term containing field and regular expression
+   */
+  public AutomatonQuery(Term term) {
+    this.term = term;
+  }
+
+  /* Rewrites to a ConstantScoreQuery over an AutomatonFilter for the same term, carrying over this query's boost.
+   * @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader)
+   */
+  public Query rewrite(IndexReader reader) throws IOException {
+    Query query = new ConstantScoreQuery(new AutomatonFilter(term));
+    query.setBoost(getBoost());
+    return query;
+  }
+
+  /* Renders "automatonQuery(term)" followed by ToStringUtils.boost(getBoost()); the field argument is not consulted.
+   * @see org.apache.lucene.search.Query#toString(java.lang.String)
+   */
+  public String toString(String field) {
+    StringBuffer buffer = new StringBuffer();
+    buffer.append("automatonQuery(");
+    buffer.append(term);
+    buffer.append(")");
+    buffer.append(ToStringUtils.boost(getBoost()));
+    return buffer.toString();
+  }
+
+
+  /* Hashes the term together with the boost bits so the result stays consistent with equals(Object).
+   * @see java.lang.Object#hashCode()
+   */
+  public int hashCode() {
+    final int prime = 31;
+    int result = prime + ((term == null) ? 0 : term.hashCode());
+    result = prime * result + Float.floatToIntBits(getBoost());
+    return result;
+  }
+
+
+  /* Equal only when class, term and boost all match; boost is compared by bits to agree with hashCode().
+   * @see java.lang.Object#equals(java.lang.Object)
+   */
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    final AutomatonQuery other = (AutomatonQuery) obj;
+    if (term == null) {
+      if (other.term != null)
+        return false;
+    } else if (!term.equals(other.term))
+      return false;
+    return Float.floatToIntBits(getBoost()) == Float.floatToIntBits(other.getBoost());
+  }
+
+}
Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java
===================================================================
--- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0)
+++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0)
@@ -0,0 +1,89 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+/** Runs AutomatonQuery against a single-document in-memory index. */
+public class TestAutomatonQuery extends TestCase {
+  private IndexSearcher searcher;
+  private final String FN = "field";
+
+  public void setUp() { /* indexes one analyzed document and opens a searcher on it */
+    RAMDirectory directory = new RAMDirectory();
+    try {
+      IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true,
+                                           IndexWriter.MaxFieldLength.LIMITED);
+      Document doc = new Document();
+      doc.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+      writer.optimize();
+      writer.close();
+      searcher = new IndexSearcher(directory);
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+  }
+
+  public void tearDown() {
+    try {
+      searcher.close();
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+  }
+
+  private Term newTerm(String value) { return new Term(FN, value); }
+
+  private int regexQueryNrHits(String regex) throws Exception { /* hit count for the regex on field FN */
+    AutomatonQuery query = new AutomatonQuery( newTerm(regex));
+    return searcher.search(query).length();
+  }
+
+  /* anchors are implied, so a pattern has to match a whole term to produce a hit */
+  public void testRegex1() throws Exception {
+    assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); /* "quick" */
+  }
+
+  public void testRegex2() throws Exception {
+    assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); /* no word of the sentence has 'c' as its third letter */
+  }
+
+  public void testRegex3() throws Exception {
+    assertEquals(0, regexQueryNrHits("q.[aeiou]c")); /* "quic" alone is not a complete word of the sentence */
+  }
+
+  /* queries built from equal terms must be equal and share a hash code */
+  public void testEquals() throws Exception {
+    AutomatonQuery query1 = new AutomatonQuery( newTerm("foo.*"));
+    AutomatonQuery query2 = new AutomatonQuery( newTerm("foo.*"));
+    assertEquals(query1, query2);
+    assertEquals(query1.hashCode(), query2.hashCode());
+  }
+
+}
+