Index: LICENSE.txt =================================================================== --- LICENSE.txt (revision 887534) +++ LICENSE.txt (working copy) @@ -237,4 +237,34 @@ http://www.python.org/download/releases/2.4.2/license/ +Some code in src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + Index: NOTICE.txt =================================================================== --- NOTICE.txt (revision 887534) +++ NOTICE.txt (working copy) @@ -33,3 +33,6 @@ ICU4J, (under contrib/collation) is licensed under an MIT styles license (contrib/collation/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Brics Automaton (under src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ Index: src/java/org/apache/lucene/search/AutomatonQuery.java =================================================================== --- src/java/org/apache/lucene/search/AutomatonQuery.java (revision 0) +++ src/java/org/apache/lucene/search/AutomatonQuery.java (revision 0) @@ -0,0 +1,159 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.SingleTermEnum; +import org.apache.lucene.util.ToStringUtils; + +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.MinimizationOperations; +import org.apache.lucene.util.automaton.SpecialOperations; + +/** + * A {@link Query} that will match terms against a finite-state machine. + *

+ * This query will match documents that contain terms accepted by a given + * finite-state machine. The automaton can be constructed with the + * {@link org.apache.lucene.util.automaton} API. Alternatively, it can be + * created from a regular expression with {@link RegexpQuery} or from + * the standard Lucene wildcard syntax with {@link WildcardQuery}. + *

+ *

+ * When the query is executed, it will create an equivalent minimal DFA of the + * finite-state machine, and will enumerate the term dictionary in an + * intelligent way to reduce the number of comparisons. For example, the regular + * expression [dl]og? will make approximately four comparisons: + * do, dog, lo, and log. + *
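As a usage sketch (not part of this patch): the automaton package here is derived from Brics, so assuming its RegExp parser is ported alongside it, the [dl]og? example above could be built as shown below. The field name "body" is only illustrative, and the Term text is ignored by the constructor.

    // Hedged sketch; RegExp (org.apache.lucene.util.automaton) is an assumption.
    Automaton fsm = new RegExp("[dl]og?").toAutomaton();
    // The Term only supplies the field; AutomatonQuery ignores its text.
    AutomatonQuery query = new AutomatonQuery(new Term("body", "[dl]og?"), fsm);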

+ */ +public class AutomatonQuery extends MultiTermQuery { + /** the automaton to match index terms against */ + protected Automaton automaton; + /** term containing the field, and possibly some pattern structure */ + protected Term term; + + /** + * Create a new AutomatonQuery from an {@link Automaton}. + * + * @param term Term containing field and possibly some pattern structure. The + * term text is ignored. + * @param automaton Automaton to run, terms that are accepted are considered a + * match. + */ + public AutomatonQuery(Term term, Automaton automaton) { + super(); + this.term = term; + this.automaton = automaton; + MinimizationOperations.minimize(automaton); + } + + @Override + protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException { + if (BasicOperations.isEmpty(automaton)) { + return new EmptyTermsEnum(term.field()); + } + + // matches a fixed string in expanded or singleton representation + String commonPrefix = SpecialOperations.getCommonPrefix(automaton); + if (automaton.equals(BasicAutomata.makeString(commonPrefix))) { + return new SingleTermsEnum(reader, term.createTerm(commonPrefix)); + } + + Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata + .makeString(commonPrefix), BasicAutomata.makeAnyString()); + if (automaton.equals(prefixAutomaton)) { + return new PrefixTermsEnum(reader, term.createTerm(commonPrefix)); + } + + return new AutomatonTermsEnum(automaton, term, reader); + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + // matches a fixed string in expanded or singleton representation + final String commonPrefix = SpecialOperations.getCommonPrefix(automaton); + if (automaton.equals(BasicAutomata.makeString(commonPrefix))) { + return new SingleTermEnum(reader, term.createTerm(commonPrefix)); + } + + Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata + .makeString(commonPrefix), BasicAutomata.makeAnyString()); + if (automaton.equals(prefixAutomaton)) { + return new PrefixTermEnum(reader, term.createTerm(commonPrefix)); + } + + return new AutomatonTermEnum(automaton, term, reader); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + ((automaton == null) ? 0 : automaton.hashCode()); + result = prime * result + ((term == null) ? 
0 : term.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (!super.equals(obj)) + return false; + if (getClass() != obj.getClass()) + return false; + AutomatonQuery other = (AutomatonQuery) obj; + if (automaton == null) { + if (other.automaton != null) + return false; + } else if (!automaton.equals(other.automaton)) + return false; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } + + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + if (!term.field().equals(field)) { + buffer.append(term.field()); + buffer.append(":"); + } + buffer.append(getClass().getSimpleName()); + buffer.append(" {"); + buffer.append('\n'); + buffer.append(automaton.toString()); + buffer.append("}"); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } +} Property changes on: src\java\org\apache\lucene\search\AutomatonQuery.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/AutomatonTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/AutomatonTermEnum.java (revision 0) +++ src/java/org/apache/lucene/search/AutomatonTermEnum.java (revision 0) @@ -0,0 +1,512 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; + +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.SpecialOperations; + +/** + * A FilteredTermEnum that enumerates terms based upon what is accepted by a + * DFA. + *

+ * The algorithm is as follows: + *

    + *
  1. As long as matches are successful, keep reading sequentially. + *
  2. When a match fails, skip to the next string in lexicographic order that + * does not enter a reject state. + *
+ *

+ * The algorithm does not attempt to actually skip to the next string that is + * completely accepted. This is not possible when the language accepted by the + * FSM is not finite (e.g. when it contains the * operator). + *
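A small illustrative sketch of the skip step (RegExp and the concrete terms are assumptions, not part of this patch): for the pattern [dl]og?, a term such as "dig" leaves the DFA right after the prefix "d", and the next string in lexicographic order that avoids a reject state is "do", so the enumerator seeks there directly instead of reading "dih", "dii", ... sequentially.

    Automaton fsm = new RegExp("[dl]og?").toAutomaton();  // RegExp assumed available
    RunAutomaton dfa = new RunAutomaton(fsm);             // tableized deterministic form
    boolean rejected = dfa.run("dig");  // false: fails after the prefix "d"
    boolean accepted = dfa.run("do");   // true: "do" is the next string after "dig"
                                        // that never enters a reject state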

+ *

+ * If the DFA has a leading Kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case it's much + * better to do just that than to use smart enumeration. + * The heuristic looks for an initial loop whose range spans at least 1/3 + * of the Unicode BMP. + * Use {@link #usesLinearMode} to find out if it enumerates all terms + * in linear mode without seeking. + *
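A hedged standalone sketch of that heuristic (the patch keeps the real check private in isSlow(); RegExp is again an assumption, and the parsed automaton is assumed to be deterministic): an automaton for ".*foo" has a self-loop on its initial state spanning most of the BMP, so the enum falls back to a linear scan, while "foo.*" does not, because its initial state only leaves on 'f'.

    Automaton fsm = new RegExp(".*foo").toAutomaton();  // leading "match anything" loop
    State initial = fsm.getInitialState();
    boolean linear = false;
    for (Transition t : initial.getTransitions()) {
      // a self-loop on the initial state covering more than 1/3 of the BMP means
      // nearly every term can begin a match, so seeking around buys nothing
      if (t.getDest() == initial
          && (t.getMax() - t.getMin()) > (Character.MAX_VALUE / 3)) {
        linear = true;
        break;
      }
    }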

+ *

+ * WARNING: The status of the Automaton feature is + * experimental. The APIs introduced here might change in the future and will + * not be supported anymore in such a case. + *

+ */ +public class AutomatonTermEnum extends FilteredTermEnum { + private final IndexReader reader; + // the term used for querying, not used here. + private final Term queryTerm; + // the object-oriented form of the DFA + private final Automaton automaton; + // a tableized array-based form of the DFA + private final RunAutomaton runAutomaton; + // true if this enum will not seek around + private final boolean linearMode; + // common suffix of the automaton + private final String commonSuffix; + // the last term that was compared + private Term lastTerm = null; + // for linear mode, true if we have reached the end of enumeration + private boolean endEnum = false; + // true if the automaton accepts a finite language + private final boolean finite; + // for complex machines that must make a lot of comparisons + private final Map transitionCache; + + /** + * Construct an enumerator based upon an automaton, enumerating the specified + * field, working on a supplied reader. + *

+ * The parameter linearMode determines whether or not it will use smart enumeration. + */ + public AutomatonTermEnum(Automaton automaton, Term queryTerm, + IndexReader reader, boolean linearMode) throws IOException { + super(); + this.reader = reader; + this.queryTerm = queryTerm; + this.automaton = automaton; + this.linearMode = linearMode; + + /* + * tableize the automaton. this also ensures it is deterministic, and has no + * transitions to dead states. + */ + runAutomaton = new RunAutomaton(this.automaton); + + if (this.linearMode) { + // iterate all terms in linear mode + this.finite = false; + transitionCache = null; + lastTerm = queryTerm.createTerm(""); + commonSuffix = cleanupSuffix(SpecialOperations.getCommonSuffix(automaton)); + } else { + // if the automaton is finite, we will never read sequentially, but always seek. + this.finite = SpecialOperations.isFinite(this.automaton); + // in nonlinear mode, the common suffix isn't that helpful. + // we will seek each time anyway (and take the unicode conversion hit). + // its also currently expensive to calculate, because getCommonSuffix is + // a bit expensive. + commonSuffix = ""; + // build a cache of sorted transitions for every state + transitionCache = new HashMap(runAutomaton.getSize()); + for (State state : this.automaton.getStates()) { + List transitions = state.getSortedTransitions(false); + transitionCache.put(state, transitions + .toArray(new Transition[transitions.size()])); + } + + String startPoint = cleanupPosition(nextString("")); + + /* + * in this case this automaton will not accept any strings. start the + * enumeration at the empty string, next() will return false. + */ + if (startPoint == null) { + startPoint = ""; + } + + lastTerm = queryTerm.createTerm(startPoint); + } + + setEnum(reader.terms(lastTerm)); + } + + /** + * Construct an enumerator based upon an automaton, enumerating the specified + * field, working on a supplied reader. + *

+ * It will automagically determine whether to enumerate the term dictionary + * in a smart way or to just do a linear scan, depending upon a heuristic. + */ + public AutomatonTermEnum(Automaton automaton, Term queryTerm, IndexReader reader) + throws IOException { + this(automaton, queryTerm, reader, AutomatonTermEnum.isSlow(automaton)); + } + + /** + * Heuristic to detect if an automaton will be so slow + * that it is better to do a linear enumeration. + *

+ * A very slow automaton will simply cause a lot of wasted disk seeks. + * In that case it is actually faster to do a linear enumeration. + *

+ * @param automaton automaton + * @return true if it will result in bad search performance + */ + private static boolean isSlow(Automaton automaton) { + /* + * If the DFA has a leading kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case its much + * better to do just that than to use smart enumeration. + * + * this heuristic looks for an initial loop, with a range of at least 1/3 + * of the unicode BMP. + */ + State initialState = automaton.getInitialState(); + boolean linearMode = false; + for (Transition transition : initialState.getTransitions()) { + if (transition.getDest() == initialState && + (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) { + linearMode = true; + break; + } + } + return linearMode; + } + + /** + * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode. + */ + public final boolean usesLinearMode() { + return linearMode; + } + + @Override + public float difference() { + return 1.0f; + } + + /** + * Returns true if the term matches the automaton. Also stashes away the term + * to assist with smart enumeration. + *

In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted. + * In smart mode, it will never do this. + */ + @Override + protected boolean termCompare(final Term term) { + lastTerm = term; + final String text = term.text(); + if (term.field() == queryTerm.field()) { + return (!linearMode || text.endsWith(commonSuffix)) && runAutomaton.run(text); + } else { + // only set endEnum in linearMode + endEnum = linearMode; + return false; + } + } + + /** + * In smart mode, increments to the next term matching this automaton. + * After a successful comparison, it simply tries the next term. + * After an unsuccessful comparison, it seeks to a smarter position. + *

If the enum is in linear mode, it simply calls {@code super.next()} to + * just filter the current enum until {@link #endEnum} returns {@code true}. + */ + @Override + public boolean next() throws IOException { + if (linearMode) + return super.next(); + + do { + /* + * if the previous enumeration was a match, don't even bother + * trying to compute the next place to seek to. + * this is an optimization for a DFA that matches many sequential terms, + * such as ab* + * we only do this if the automaton is infinite. + */ + if (!finite && lastTerm == currentTerm) { + actualEnum.next(); + } else { + // seek to the next possible string + String nextPoint = nextString(lastTerm.text()); + if (nextPoint == null) { // no more possible strings can match + currentTerm = null; + endEnum = true; + return false; + } + // replace the old enumerator with a new one, positioned to a nice place + actualEnum.close(); + actualEnum = reader.terms(lastTerm.createTerm(nextPoint)); + } + + Term candidateTerm = actualEnum.term(); // read a term + + /* + * this means end of enumeration: no more terms for this field or no more + * terms at all + */ + if (candidateTerm == null || candidateTerm.field() != queryTerm.field()) { + currentTerm = null; + endEnum = true; + return false; + } + + // if the term matches the automaton, success! + if (termCompare(candidateTerm)) { + currentTerm = candidateTerm; + return true; + } + } while (true); + } + + /** + * This method should only be called in linear mode, in smart + * mode the result is undefined, as the handling of exhausted enums + * is done inside {@link #next}. + */ + @Override + protected boolean endEnum() { + assert linearMode : "endEnum() should only be called in linear mode"; + return endEnum; + } + + /** + * Returns the next String in lexicographic order after s that will not put + * the machine into a reject state. If such a string does not exist, returns + * null. + * + * The correctness of this method depends upon the automaton being deterministic, + * and having no transitions to dead states. + * + * @param s input String + * @return next valid String + */ + private String nextString(String s) { + State state; + int pos = 0; + + while (true) { + state = automaton.getInitialState(); + // walk the automaton until a character is rejected. + for (pos = 0; pos < s.length(); pos++) { + State nextState = step(state, s.charAt(pos)); + if (nextState == null) + break; + else + state = nextState; + } + + // take the useful portion, and the last non-reject state, and attempt to + // append characters that will match. + String nextString = nextString(s, state, pos); + if (nextString != null) { + return cleanupPosition(nextString); + } else { /* no more solutions exist from this useful portion, backtrack */ + String sprime = backtrack(s, pos); + if (sprime == null) /* no more solutions at all */ + return null; + else if (runAutomaton.run(sprime)) /* String is good to go as-is */ + return cleanupPosition(sprime); + else /* advance further */ + s = sprime; + } + } + } + + /** + * Returns the next String in lexicographic order after s that will not put + * the machine into a reject state. Appends some characters to the useful + * portion. If this cannot satisfy the machine, returns null. This method will + * walk the minimal path, in lexicographic order, as long as possible. 
+ * + * @param s input String + * @param state current non-reject state + * @param useful useful portion of the string + * @return next valid String + */ + private String nextString(String s, State state, int useful) { + /* + * the next lexicographic character must be greater than the existing + * character, if it exists. + */ + char c = 0; + if (useful < s.length()) { + c = s.charAt(useful); + // if the next character is U+FFFF and is not part of the useful portion, + // then by definition it puts us in a reject state, and therefore this + // path is dead. there cannot be any higher transitions. backtrack. + if (c == '\uFFFF') + return null; + else + c++; + } + + StringBuilder sb = new StringBuilder(); + // append the useful portion + sb.append(s, 0, useful); + + Set visited = new HashSet(); + visited.add(state); + + Transition transitions[] = getTransitions(state); + + // find the minimal path (lexicographic order) that is >= c + + for (int i = 0; i < transitions.length; i++) { + Transition transition = transitions[i]; + if (transition.getMax() >= c) { + char nextChar = (char) Math.max(c, transition.getMin()); + sb.append(nextChar); + state = transition.getDest(); + /* + * as long as is possible, continue down the minimal path in + * lexicographic order. if a loop or accept state is encountered, stop. + */ + while (!visited.contains(state) && !state.isAccept()) { + visited.add(state); + /* + * Note: we work with a DFA with no transitions to dead states. + * so the below is ok, if it is not an accept state, + * then there MUST be at least one transition. + */ + transition = getTransitions(state)[0]; + sb.append(transition.getMin()); + state = transition.getDest(); + } + return sb.toString(); + } + } + return null; + } + + /** + * Backtrack thru the string after encountering a dead end. + * + * @param s input String + * @param useful useful portion of the string + * @return next valid String to evaluate against the DFA, or null + */ + private String backtrack(String s, int useful) { + while (useful > 0) { + char nextChar = s.charAt(useful - 1); + // if a character is U+FFFF its a dead-end too, + // because there is no higher character in UTF-16 sort order. + if (nextChar != '\uFFFF') { + nextChar++; + return s.substring(0, useful - 1) + nextChar; + } + useful--; + } + return null; /* all solutions exhausted */ + } + + /** + * Get the cached set of transitions for a state. + */ + private Transition[] getTransitions(State state) { + return transitionCache.get(state); + } + + /** + * Step the state machine forward one character, + * using cached transitions. + */ + private State step(State state, char c) { + Transition transitions[] = getTransitions(state); + for (int i = 0; i < transitions.length; i++) + if (transitions[i].getMin() <= c && c <= transitions[i].getMax()) + return transitions[i].getDest(); + return null; + } + + /** + * if the seek position cannot be converted to valid UTF-8, + * then return the next valid String (in UTF-16 sort order) that + * can be converted to valid UTF-8. 
+ */ + private String cleanupPosition(String position) { + if (position != null) { + StringBuilder sb = new StringBuilder(); + + for (int i = 0; i < position.length(); i++) { + final char ch = position.charAt(i); + if (Character.isHighSurrogate(ch)) { + if (i + 1 < position.length()) { + + final char ch2 = position.charAt(i + 1); + if (ch2 < Character.MIN_LOW_SURROGATE) { + // invalid case #1, initial or medial in term + // high paired with invalid low, bump the next char up to MIN_LOW + sb.append(ch); + sb.append(Character.MIN_LOW_SURROGATE); + return sb.toString(); + } else if (ch2 > Character.MAX_LOW_SURROGATE) { + // invalid case #2, initial or medial in term + // high paired with invalid low, but its past the boundary. + // this means all supp. characters have been enumerated. + // ditch both the chars, replace with the first valid codepoint + // after the surrogate range. + sb.append((char)(Character.MAX_LOW_SURROGATE + 1)); + return sb.toString(); + } else { + sb.append(ch); + } + + } else { + // invalid case #3, final in term + // unpaired high, tack on MIN_LOW + sb.append(ch); + sb.append(Character.MIN_LOW_SURROGATE); + return sb.toString(); + } + } else if (i > 0 && Character.isLowSurrogate(ch)) { + final char ch1 = position.charAt(i - 1); + if (Character.isHighSurrogate(ch1)) { + sb.append(ch); + } else { + // invalid case #4, medial unpaired low. bump past the boundary. + sb.append((char)(Character.MAX_LOW_SURROGATE + 1)); + return sb.toString(); + } + } else if (Character.isLowSurrogate(ch)){ + // invalid case #5, initial unpaired low. bump past the boundary. + sb.append((char)(Character.MAX_LOW_SURROGATE + 1)); + return sb.toString(); + } else { + sb.append(ch); + } + } + return sb.toString(); + } else + return null; + } + + /** + * if the suffix starts with a low surrogate, remove it. + * This won't be quite as efficient, but can be converted to valid UTF-8 + * + * This isn't nearly as complex as cleanupPosition, because its not + * going to use this suffix to walk any path thru the terms. + * + */ + private String cleanupSuffix(String suffix) { + if (suffix != null && suffix.length() > 0 && + Character.isLowSurrogate(suffix.charAt(0))) + return suffix.substring(1); + else + return suffix; + } +} Property changes on: src\java\org\apache\lucene\search\AutomatonTermEnum.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/AutomatonTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 0) +++ src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 0) @@ -0,0 +1,467 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermRef; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; + +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.SpecialOperations; + +/** + * A FilteredTermsEnum that enumerates terms based upon what is accepted by a + * DFA. + *

+ * The algorithm is as follows: + *

    + *
  1. As long as matches are successful, keep reading sequentially. + *
  2. When a match fails, skip to the next string in lexicographic order that + * does not enter a reject state. + *
+ *

+ * The algorithm does not attempt to actually skip to the next string that is + * completely accepted. This is not possible when the language accepted by the + * FSM is not finite (e.g. when it contains the * operator). + *

+ *

+ * If the DFA has a leading Kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case it's much + * better to do just that than to use smart enumeration. + * The heuristic looks for an initial loop whose range spans at least 1/3 + * of the Unicode BMP. + * Use {@link #usesLinearMode} to find out if it enumerates all terms + * in linear mode without seeking. + *

+ *

+ * WARNING: The status of the Automaton feature is + * experimental. The APIs introduced here might change in the future and will + * not be supported anymore in such a case. + *

+ */ +public class AutomatonTermsEnum extends FilteredTermsEnum { + // the object-oriented form of the DFA + private final Automaton automaton; + // a tableized array-based form of the DFA + private final RunAutomaton runAutomaton; + // true if this enum will not seek around + private final boolean linearMode; + // common suffix of the automaton + private final TermRef commonSuffixRef; + // true if the automaton accepts a finite language + private final boolean finite; + // for complex machines that must make a lot of comparisons + private final Map transitionCache; + // used for unicode conversion from TermRef byte[] to char[] + private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + // the reference used for seeking forwards through the term dictionary + private final TermRef seekTermRef = new TermRef(); + // the field being enumerated + private final String field; + + private boolean uninitialized = true; + + // this accept stati will be returned by accept() dependent on internal mode + private final AcceptStatus NO_MATCH, YES_MATCH; + + /** + * Construct an enumerator based upon an automaton, enumerating the specified + * field, working on a supplied reader. + *

+ * The parameter linearMode determines whether or not it will use smart enumeration. + */ + AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader, boolean linearMode) + throws IOException { + super(reader, queryTerm.field()); + this.automaton = automaton; + field = queryTerm.field(); + this.linearMode = linearMode; + + /* + * tableize the automaton. this also ensures it is deterministic, and has no + * transitions to dead states. + */ + runAutomaton = new RunAutomaton(this.automaton); + + if (this.linearMode) { + // iterate all terms in linear mode + this.finite = false; + transitionCache = null; + commonSuffixRef = new TermRef(cleanupSuffix(SpecialOperations.getCommonSuffix(automaton))); + NO_MATCH = AcceptStatus.NO; + YES_MATCH = AcceptStatus.YES; + } else { + // if the automaton is finite, we will never read sequentially, but always seek. + this.finite = SpecialOperations.isFinite(this.automaton); + // in nonlinear mode, the common suffix isn't that helpful. + // we will seek each time anyway (and take the unicode conversion hit). + // its also currently expensive to calculate, because getCommonSuffix is + // a bit expensive. + commonSuffixRef = new TermRef(""); + // build a cache of sorted transitions for every state + transitionCache = new HashMap(runAutomaton.getSize()); + for (org.apache.lucene.util.automaton.State state : this.automaton.getStates()) { + List transitions = state.getSortedTransitions(false); + transitionCache.put(state, transitions.toArray(new Transition[transitions.size()])); + } + + NO_MATCH = AcceptStatus.NO_AND_SEEK; + YES_MATCH = finite ? AcceptStatus.YES_AND_SEEK : AcceptStatus.YES; + } + } + + /** + * Construct an enumerator based upon an automaton, enumerating the specified + * field, working on a supplied reader. + *

+ * It will automagically determine whether to enumerate the term dictionary + * in a smart way or to just do a linear scan, depending upon a heuristic. + */ + public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader) + throws IOException { + this(automaton, queryTerm, reader, AutomatonTermsEnum.isSlow(automaton)); + } + + /** + * Heuristic to detect if an automaton will be so slow + * that it is better to do a linear enumeration. + *

+ * A very slow automaton will simply cause a lot of wasted disk seeks. + * In that case it is actually faster to do a linear enumeration. + *

+ * @param automaton automaton + * @return true if it will result in bad search performance + */ + private static boolean isSlow(Automaton automaton) { + /* + * If the DFA has a leading kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case its much + * better to do just that than to use smart enumeration. + * + * this heuristic looks for an initial loop, with a range of at least 1/3 + * of the unicode BMP. + */ + org.apache.lucene.util.automaton.State initialState = automaton.getInitialState(); + boolean linearMode = false; + for (Transition transition : initialState.getTransitions()) { + if (transition.getDest() == initialState && + (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) { + linearMode = true; + break; + } + } + return linearMode; + } + + /** + * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode. + */ + public final boolean usesLinearMode() { + return linearMode; + } + + @Override + public float difference() { + return 1.0f; + } + + /** + * Returns true if the term matches the automaton. Also stashes away the term + * to assist with smart enumeration. + *

In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted. + * In smart mode, it will never do this. + */ + @Override + protected AcceptStatus accept(final TermRef term) { + if (term.endsWith(commonSuffixRef)) { + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + return runAutomaton.run(utf16.result, 0, utf16.length) ? YES_MATCH : NO_MATCH; + } else { + return NO_MATCH; + } + } + + @Override + protected TermRef nextSeekTerm(final boolean enumExhausted) throws IOException { + if (enumExhausted) + return null; + if (uninitialized) { + uninitialized = false; + // return the first seek term + if (linearMode) { + seekTermRef.copy(""); + } else { + final String firstPoint = nextString(""); + if (firstPoint == null) + return null; + seekTermRef.copy(firstPoint); + } + return seekTermRef; + } else if (!linearMode) { + // seek to the next possible string + final String nextPoint = nextString(tenum.term().toString()); + if (nextPoint != null) { + // reposition + seekTermRef.copy(nextPoint); + return seekTermRef; + } + } + // no more possible strings can match + return null; + } + + /** + * Returns the next String in lexicographic order after s that will not put + * the machine into a reject state. If such a string does not exist, returns + * null. + * + * The correctness of this method depends upon the automaton being deterministic, + * and having no transitions to dead states. + * + * @param s input String + * @return next valid String + */ + private String nextString(String s) { + org.apache.lucene.util.automaton.State state; + int pos = 0; + + while (true) { + state = automaton.getInitialState(); + // walk the automaton until a character is rejected. + for (pos = 0; pos < s.length(); pos++) { + org.apache.lucene.util.automaton.State nextState = step(state, s.charAt(pos)); + if (nextState == null) + break; + else + state = nextState; + } + + // take the useful portion, and the last non-reject state, and attempt to + // append characters that will match. + String nextString = nextString(s, state, pos); + if (nextString != null) { + return cleanupPosition(nextString); + } else { /* no more solutions exist from this useful portion, backtrack */ + String sprime = backtrack(s, pos); + if (sprime == null) /* no more solutions at all */ + return null; + else if (runAutomaton.run(sprime)) /* String is good to go as-is */ + return cleanupPosition(sprime); + else /* advance further */ + s = sprime; + } + } + } + + /** + * Returns the next String in lexicographic order after s that will not put + * the machine into a reject state. Appends some characters to the useful + * portion. If this cannot satisfy the machine, returns null. This method will + * walk the minimal path, in lexicographic order, as long as possible. + * + * @param s input String + * @param state current non-reject state + * @param useful useful portion of the string + * @return next valid String + */ + private String nextString(String s, org.apache.lucene.util.automaton.State state, int useful) { + /* + * the next lexicographic character must be greater than the existing + * character, if it exists. + */ + char c = 0; + if (useful < s.length()) { + c = s.charAt(useful); + // if the next character is U+FFFF and is not part of the useful portion, + // then by definition it puts us in a reject state, and therefore this + // path is dead. there cannot be any higher transitions. backtrack. 
+ if (c == '\uFFFF') + return null; + else + c++; + } + + StringBuilder sb = new StringBuilder(); + // append the useful portion + sb.append(s, 0, useful); + + Set visited = new HashSet(); + visited.add(state); + + Transition transitions[] = getTransitions(state); + + // find the minimal path (lexicographic order) that is >= c + + for (int i = 0; i < transitions.length; i++) { + Transition transition = transitions[i]; + if (transition.getMax() >= c) { + char nextChar = (char) Math.max(c, transition.getMin()); + sb.append(nextChar); + state = transition.getDest(); + /* + * as long as is possible, continue down the minimal path in + * lexicographic order. if a loop or accept state is encountered, stop. + */ + while (!visited.contains(state) && !state.isAccept()) { + visited.add(state); + /* + * Note: we work with a DFA with no transitions to dead states. + * so the below is ok, if it is not an accept state, + * then there MUST be at least one transition. + */ + transition = getTransitions(state)[0]; + sb.append(transition.getMin()); + state = transition.getDest(); + } + return sb.toString(); + } + } + return null; + } + + /** + * Backtrack thru the string after encountering a dead end. + * + * @param s input String + * @param useful useful portion of the string + * @return next valid String to evaluate against the DFA, or null + */ + private String backtrack(String s, int useful) { + while (useful > 0) { + char nextChar = s.charAt(useful - 1); + // if a character is U+FFFF its a dead-end too, + // because there is no higher character in UTF-16 sort order. + if (nextChar != '\uFFFF') { + nextChar++; + return s.substring(0, useful - 1) + nextChar; + } + useful--; + } + return null; /* all solutions exhausted */ + } + + /** + * Get the cached set of transitions for a state. + */ + private Transition[] getTransitions(org.apache.lucene.util.automaton.State state) { + return transitionCache.get(state); + } + + /** + * Step the state machine forward one character, + * using cached transitions. + */ + private org.apache.lucene.util.automaton.State step( + org.apache.lucene.util.automaton.State state, char c) { + Transition transitions[] = getTransitions(state); + for (int i = 0; i < transitions.length; i++) + if (transitions[i].getMin() <= c && c <= transitions[i].getMax()) + return transitions[i].getDest(); + return null; + } + + /** + * if the seek position cannot be converted to valid UTF-8, + * then return the next valid String (in UTF-16 sort order) that + * can be converted to valid UTF-8. + */ + private String cleanupPosition(String position) { + if (position != null) { + StringBuilder sb = new StringBuilder(); + + for (int i = 0; i < position.length(); i++) { + final char ch = position.charAt(i); + if (Character.isHighSurrogate(ch)) { + if (i + 1 < position.length()) { + + final char ch2 = position.charAt(i + 1); + if (ch2 < Character.MIN_LOW_SURROGATE) { + // invalid case #1, initial or medial in term + // high paired with invalid low, bump the next char up to MIN_LOW + sb.append(ch); + sb.append(Character.MIN_LOW_SURROGATE); + return sb.toString(); + } else if (ch2 > Character.MAX_LOW_SURROGATE) { + // invalid case #2, initial or medial in term + // high paired with invalid low, but its past the boundary. + // this means all supp. characters have been enumerated. + // ditch both the chars, replace with the first valid codepoint + // after the surrogate range. 
+ sb.append((char)(Character.MAX_LOW_SURROGATE + 1)); + return sb.toString(); + } else { + sb.append(ch); + } + + } else { + // invalid case #3, final in term + // unpaired high, tack on MIN_LOW + sb.append(ch); + sb.append(Character.MIN_LOW_SURROGATE); + return sb.toString(); + } + } else if (i > 0 && Character.isLowSurrogate(ch)) { + final char ch1 = position.charAt(i - 1); + if (Character.isHighSurrogate(ch1)) { + sb.append(ch); + } else { + // invalid case #4, medial unpaired low. bump past the boundary. + sb.append((char)(Character.MAX_LOW_SURROGATE + 1)); + return sb.toString(); + } + } else if (Character.isLowSurrogate(ch)){ + // invalid case #5, initial unpaired low. bump past the boundary. + sb.append((char)(Character.MAX_LOW_SURROGATE + 1)); + return sb.toString(); + } else { + sb.append(ch); + } + } + return sb.toString(); + } else + return null; + } + + /** + * if the suffix starts with a low surrogate, remove it. + * This won't be quite as efficient, but can be converted to valid UTF-8 + * + * This isn't nearly as complex as cleanupPosition, because its not + * going to use this suffix to walk any path thru the terms. + * + */ + private String cleanupSuffix(String suffix) { + if (suffix != null && suffix.length() > 0 && + Character.isLowSurrogate(suffix.charAt(0))) + return suffix.substring(1); + else + return suffix; + } +} Property changes on: src\java\org\apache\lucene\search\AutomatonTermsEnum.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/EmptyTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/EmptyTermsEnum.java (revision 0) +++ src/java/org/apache/lucene/search/EmptyTermsEnum.java (revision 0) @@ -0,0 +1,62 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.TermRef; +import org.apache.lucene.index.TermsEnum; + +/** + * Subclass of {@code FilteredTermsEnum} that is always empty. + *

+ * This can be used by {@link MultiTermQuery}s (if no terms can ever match the query), + * but want to preserve MultiTermQuery semantics such as + * {@link MultiTermQuery#rewriteMethod}. + */ +public final class EmptyTermsEnum extends FilteredTermsEnum { + + /** + * Creates a new EmptyTermsEnum. + */ + public EmptyTermsEnum(final String field) { + super((TermsEnum) null, field); + } + + @Override + /** Always returns {@link AcceptStatus#END}. */ + protected AcceptStatus accept(TermRef term) { + return AcceptStatus.END; + } + + @Override + public float difference() { + return 1.0F; + } + + /** Always returns {@link SeekStatus#END}. */ + @Override + public SeekStatus seek(TermRef term) { + return SeekStatus.END; + } + + /** Always returns {@link SeekStatus#END}. */ + @Override + public SeekStatus seek(long ord) { + return SeekStatus.END; + } + +} Property changes on: src\java\org\apache\lucene\search\EmptyTermsEnum.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/FilteredTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/FilteredTermsEnum.java (revision 887534) +++ src/java/org/apache/lucene/search/FilteredTermsEnum.java (working copy) @@ -18,153 +18,218 @@ */ import java.io.IOException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermRef; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; /** * Abstract class for enumerating a subset of all terms. - * - *

On creation, the enumerator must already be positioned - * to the first term.

* - *

Term enumerations are always ordered by - * Term.compareTo(). Each term in the enumeration is + *

Term enumerations should always be ordered by + * {@link #getTermComparator}. Each term in the enumeration is * greater than all that precede it.

-*/ + *

This enum cannot guarantee the ordering if you use seeking mode + * (override {@link #nextSeekTerm}) and provide + * seek terms that are out of order or not greater + * than the current term. All {@code FilteredTermsEnum} + * implementations in Lucene Core honor this.

+ *

For {@link MultiTermQuery}, the order is not + * important, but public subclasses should be ordered. + */ public abstract class FilteredTermsEnum extends TermsEnum { - protected static enum AcceptStatus {YES, NO, END}; + private final String field; - /** the delegate enum - to set this member use {@link #setEnum} */ - protected TermsEnum actualEnum; + private TermRef initialSeekTerm = null; + private boolean doSeek = true, exhausted = false; + + protected final TermsEnum tenum; + + /** Return value indicating whether the term should be accepted or the iteration should + * {@code END}. The {@code *_SEEK} values denote that after handling the current term + * the enum should call {@link #nextSeekTerm} and step forward. + * @see #accept(TermRef) + */ + protected static enum AcceptStatus {YES, YES_AND_SEEK, NO, NO_AND_SEEK, END}; + + /** Returns whether the term is accepted, not accepted, or the iteration should end + * (and possibly seek). + */ + protected abstract AcceptStatus accept(TermRef term) throws IOException; - /** Return true if term is accepted */ - protected abstract AcceptStatus accept(TermRef term); - - /** Equality measure on the term */ - public abstract float difference(); + /** Equality measure on the term, used by {@link FuzzyQuery} and + * scoring {@link MultiTermQuery}. */ + public abstract float difference() throws IOException; - public abstract String field(); + /** + * Creates a filtered {@link TermsEnum} for the given field name and reader. + */ + public FilteredTermsEnum(final IndexReader reader, final String field) throws IOException { + this.field = field; + final Terms terms = reader.fields().terms(field); + tenum = (terms != null) ? terms.iterator() : null; + } - /** Only called once, right after construction, to check - * whether there are no matching terms */ - public abstract boolean empty(); + /** + * Creates a filtered {@link TermsEnum} on a terms enum for the given field name. + * @param tenum the terms enumeration to filter; if {@code null} this is the null iterator. + * @param field the field name this enum operates on (needed by {@link MultiTermQuery}). + */ + public FilteredTermsEnum(final TermsEnum tenum, final String field) { + this.tenum = tenum; + this.field = field; + } /** - * use this method to set the actual TermsEnum (e.g. in ctor), - * it will be automatically positioned on the first - * accepted term, and returns the term found or null if - * there is no matching term. + * Use this method to set the initial {@link TermRef} + * to seek before iterating. This is a convenience method for + * subclasses that do not override {@link #nextSeekTerm}. + * If the initial seek term is {@code null} (default), + * the enum is empty. + *

You can only use this method if you keep the default + * implementation of {@link #nextSeekTerm}. */ - protected TermRef setEnum(TermsEnum actualEnum, TermRef term) throws IOException { - this.actualEnum = actualEnum; + protected final void setInitialSeekTerm(TermRef term) throws IOException { + this.initialSeekTerm = term; + } + + /** On the first call to {@link #next}, or if {@link #accept} returns + * {@link AcceptStatus#YES_AND_SEEK} or {@link AcceptStatus#NO_AND_SEEK}, + * this method is called to possibly seek the underlying TermsEnum + * to a new position. + * By default this method returns the initial seek term once + * and then {@code null}, so no repositioning is ever done. + *

Override this method if you want a more sophisticated TermsEnum + * that repositions the iterator during enumeration. + * If the {@code enumExhausted} parameter is {@code true}, the underlying + * enumeration is already exhausted and you do not need to return further terms + * (see below). + * If this method always returns {@code null}, the enum is empty. + *
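To make the reworked contract concrete, here is a minimal hypothetical subclass sketch (not part of this patch, and similar in spirit to the existing PrefixTermsEnum): it seeks once to an initial term via setInitialSeekTerm, relies on the default nextSeekTerm, and filters with accept, ending the enumeration at the first term past the prefix.

    // Hypothetical example only; field and prefix names come from the caller.
    final class ExamplePrefixTermsEnum extends FilteredTermsEnum {
      private final String prefix;

      ExamplePrefixTermsEnum(IndexReader reader, String field, String prefix) throws IOException {
        super(reader, field);
        this.prefix = prefix;
        // seek once to the prefix; the default nextSeekTerm() returns it a single time
        setInitialSeekTerm(new TermRef(prefix));
      }

      @Override
      protected AcceptStatus accept(TermRef term) {
        // the term dictionary is sorted, so the first non-matching term ends the enum
        return term.toString().startsWith(prefix) ? AcceptStatus.YES : AcceptStatus.END;
      }

      @Override
      public float difference() {
        return 1.0f;  // constant "similarity" for all accepted terms
      }
    }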

Please note: This method should always provide a greater term + * than the last enumerated term, else the behaviour of this enum + * violates the contract for TermsEnums. So you are allowed to return new + * terms for {@code enumExhausted == true}, but that would seek eventually + * backwards. + */ + protected TermRef nextSeekTerm(final boolean enumExhausted) throws IOException { + if (enumExhausted) + return null; + final TermRef t = initialSeekTerm; + initialSeekTerm = null; + return t; + } - // Find the first term that matches - if (term != null) { - SeekStatus status = actualEnum.seek(term); - if (status == SeekStatus.END) { - return null; - } else { - AcceptStatus s = accept(actualEnum.term()); - if (s == AcceptStatus.NO) { - return next(); - } else if (s == AcceptStatus.END) { - return null; - } else { - return actualEnum.term(); - } - } - } else { - return next(); - } + /** returns the field this TermsEnum is working on */ + public final String field() { + return field; } + /** + * Returns the related attributes, the returned {@link AttributeSource} + * is shared with the delegate {@code TermsEnum}. + */ @Override + public AttributeSource attributes() { + /* if we have no tenum, we return a new attributes instance, + * to prevent NPE in subclasses that use attributes. + * in all other cases we share the attributes with our delegate. */ + return (tenum == null) ? super.attributes() : tenum.attributes(); + } + + @Override public TermRef term() throws IOException { - if(actualEnum == null) { - return null; - } - return actualEnum.term(); + return (tenum == null) ? null : tenum.term(); } @Override - /** Don't call this until after setEnum, else you'll hit NPE */ public TermRef.Comparator getTermComparator() throws IOException { - return actualEnum.getTermComparator(); + return (tenum == null) ? null : tenum.getTermComparator(); } - /** - * Returns the docFreq of the current Term in the enumeration. - * Returns -1 if no Term matches or all terms have been enumerated. - */ @Override public int docFreq() { - assert actualEnum != null; - return actualEnum.docFreq(); + return (tenum == null) ? -1 : tenum.docFreq(); } - - /** Increments the enumeration to the next element. - * Non-null if one exists, or null if it's the end. */ - @Override - public TermRef next() throws IOException { - assert actualEnum != null; - while (true) { - TermRef term = actualEnum.next(); - if (term != null) { - AcceptStatus s = accept(term); - if (s == AcceptStatus.YES) { - return term; - } else if (s == AcceptStatus.END) { - // end - return null; - } - } else { - // end - return null; - } - } - } + /** This enum does not support seeking! + * @throws UnsupportedOperationException + */ @Override public SeekStatus seek(TermRef term) throws IOException { - return finishSeek(actualEnum.seek(term)); + throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); } + /** This enum does not support seeking! 
+ * @throws UnsupportedOperationException + */ @Override public SeekStatus seek(long ord) throws IOException { - return finishSeek(actualEnum.seek(ord)); + throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); } - private SeekStatus finishSeek(SeekStatus status) throws IOException { - if (status != SeekStatus.END) { - TermRef term = actualEnum.term(); - final AcceptStatus s = accept(term); - if (s == AcceptStatus.NO) { - term = next(); - if (term == null) { - return SeekStatus.END; - } else { - return SeekStatus.NOT_FOUND; - } - } else if (s == AcceptStatus.END) { - return SeekStatus.END; - } else { - return status; - } - } else { - return status; - } - } - @Override public long ord() throws IOException { - return actualEnum.ord(); + return (tenum == null) ? -1 : tenum.ord(); } @Override public DocsEnum docs(Bits bits) throws IOException { - return actualEnum.docs(bits); + return (tenum == null) ? null : tenum.docs(bits); } + + @Override + public TermRef next() throws IOException { + if (exhausted || tenum == null) + return null; + boolean delegateExhausted = false; + for (;;) { + // Seek or forward the iterator + final TermRef term; + if (doSeek) { + final TermRef t = nextSeekTerm(delegateExhausted); + if (t == null) { + // no more terms to seek we must end now! + exhausted = true; + return null; + } + if (tenum.seek(t) == SeekStatus.END) { + // enum exhausted, seek to next one + delegateExhausted = true; + continue; + } + delegateExhausted = doSeek = false; + term = tenum.term(); + } else { + term = tenum.next(); + if (term == null) { + // enum exhausted + delegateExhausted = doSeek = true; + continue; + } + } + + // check if term is accepted + switch (accept(term)) { + case YES_AND_SEEK: + doSeek = true; + // term accepted, but we need to seek so fall-through + case YES: + // term accepted + return term; + case NO_AND_SEEK: + // invalid term, seek next time + doSeek = true; + break; + case END: + // we are supposed to end the enum + exhausted = true; + return null; + } + } + } + } Index: src/java/org/apache/lucene/search/FuzzyTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 887534) +++ src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy) @@ -19,7 +19,6 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermRef; import java.io.IOException; @@ -27,8 +26,9 @@ /** Subclass of FilteredTermEnum for enumerating all terms that are similar * to the specified filter term. * - *

Term enumerations are always ordered by Term.compareTo(). Each term in - * the enumeration is greater than all that precede it. + *

Term enumerations are always ordered by + * {@link #getTermComparator}. Each term in the enumeration is + * greater than all that precede it.

*/ public final class FuzzyTermsEnum extends FilteredTermsEnum { @@ -44,10 +44,8 @@ private int[][] d; private float similarity; - private final boolean empty; private Term searchTerm; - private final String field; private final String text; private final String prefix; @@ -102,7 +100,7 @@ * @throws IOException */ public FuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException { - super(); + super(reader, term.field()); if (minSimilarity >= 1.0f) throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1"); @@ -114,7 +112,6 @@ this.minimumSimilarity = minSimilarity; this.scale_factor = 1.0f / (1.0f - minimumSimilarity); this.searchTerm = term; - this.field = searchTerm.field(); //The prefix could be longer than the word. //It's kind of silly though. It means we must match the entire word. @@ -127,21 +124,11 @@ initializeMaxDistances(); this.d = initDistanceArray(); - Terms terms = reader.fields().terms(field); - if (terms != null) { - empty = setEnum(terms.iterator(), prefixTermRef) == null; - } else { - empty = false; - } + setInitialSeekTerm(prefixTermRef); } private final TermRef prefixTermRef; - @Override - public String field() { - return field; - } - /** * The termCompare method in FuzzyTermEnum uses Levenshtein distance to * calculate the distance between the given term and the comparing term. @@ -163,11 +150,6 @@ return (float)((similarity - minimumSimilarity) * scale_factor); } - @Override - public final boolean empty() { - return empty; - } - /****************************** * Compute Levenshtein distance ******************************/ Index: src/java/org/apache/lucene/search/MultiTermQuery.java =================================================================== --- src/java/org/apache/lucene/search/MultiTermQuery.java (revision 887534) +++ src/java/org/apache/lucene/search/MultiTermQuery.java (working copy) @@ -103,22 +103,17 @@ // nocommit -- if no terms we'd want to return NullQuery BooleanQuery result = new BooleanQuery(true); - if (!termsEnum.empty()) { - final String field = termsEnum.field(); - assert field != null; - int count = 0; - TermRef term = termsEnum.term(); - // first term must exist since termsEnum wasn't null - assert term != null; - do { - TermQuery tq = new TermQuery(new Term(field, term.toString())); // found a match - tq.setBoost(query.getBoost() * termsEnum.difference()); // set the boost - result.add(tq, BooleanClause.Occur.SHOULD); // add to query - count++; - term = termsEnum.next(); - } while(term != null); - query.incTotalNumberOfTerms(count); + final String field = termsEnum.field(); + assert field != null; + int count = 0; + TermRef term; + while ((term = termsEnum.next()) != null) { + TermQuery tq = new TermQuery(new Term(field, term.toString())); // found a match + tq.setBoost(query.getBoost() * termsEnum.difference()); // set the boost + result.add(tq, BooleanClause.Occur.SHOULD); // add to query + count++; } + query.incTotalNumberOfTerms(count); return result; } else { // deprecated case @@ -167,9 +162,14 @@ private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable { @Override public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { - // strip the scores off - Query result = new ConstantScoreQuery(new QueryWrapperFilter(super.rewrite(reader, query))); - result.setBoost(query.getBoost()); + Query result = super.rewrite(reader, query); + assert result instanceof BooleanQuery; + 
// nocommit: if empty boolean query return NullQuery + if (!((BooleanQuery) result).clauses().isEmpty()) { + // strip the scores off + result = new ConstantScoreQuery(new QueryWrapperFilter(result)); + result.setBoost(query.getBoost()); + } return result; } @@ -248,54 +248,53 @@ // exhaust the enum before hitting either of the // cutoffs, we use ConstantBooleanQueryRewrite; else, // ConstantFilterRewrite: - final Collection pendingTerms = new ArrayList(); - final Collection oldApiPendingTerms = new ArrayList(); final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); int docVisitCount = 0; FilteredTermsEnum termsEnum = query.getTermsEnum(reader); if (termsEnum != null) { - if (!termsEnum.empty()) { - final String field = termsEnum.field(); - assert field != null; - TermRef term = termsEnum.term(); - // first term must exist since termsEnum wasn't null - assert term != null; - do { - pendingTerms.add((TermRef) term.clone()); - if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { - // Too many terms -- cut our losses now and make a filter. - Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); - result.setBoost(query.getBoost()); - return result; - } - // Loading the TermInfo from the terms dict here - // should not be costly, because 1) the - // query/filter will load the TermInfo when it - // runs, and 2) the terms dict has a cache: - docVisitCount += reader.docFreq(field, term); - term = termsEnum.next(); - } while(term != null); - - // Enumeration is done, and we hit a small - // enough number of terms & docs -- just make a - // BooleanQuery, now + final Collection pendingTerms = new ArrayList(); + final String field = termsEnum.field(); + assert field != null; + TermRef term; + while ((term = termsEnum.next()) != null) { + pendingTerms.add((TermRef) term.clone()); + if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { + // Too many terms -- cut our losses now and make a filter. 
+ Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); + result.setBoost(query.getBoost()); + return result; + } + // Loading the TermInfo from the terms dict here + // should not be costly, because 1) the + // query/filter will load the TermInfo when it + // runs, and 2) the terms dict has a cache: + docVisitCount += reader.docFreq(field, term); + } + + // Enumeration is done, and we hit a small + // enough number of terms & docs -- just make a + // BooleanQuery, now + + // nocommit: if pendingTerms.size()==0 return NullQuery + final Query result; + if (pendingTerms.isEmpty()) { + result = new BooleanQuery(true); + } else { BooleanQuery bq = new BooleanQuery(true); for(TermRef termRef : pendingTerms) { TermQuery tq = new TermQuery(new Term(field, termRef.toString())); bq.add(tq, BooleanClause.Occur.SHOULD); } // Strip scores - Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); + result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); result.setBoost(query.getBoost()); - query.incTotalNumberOfTerms(pendingTerms.size()); - return result; - } else { - // nocommit -- need NullQuery here - return new BooleanQuery(); } + query.incTotalNumberOfTerms(pendingTerms.size()); + return result; } else { + final Collection pendingTerms = new ArrayList(); // deprecated case FilteredTermEnum enumerator = query.getEnum(reader); @@ -303,7 +302,7 @@ while(true) { Term t = enumerator.term(); if (t != null) { - oldApiPendingTerms.add(t); + pendingTerms.add(t); // Loading the TermInfo from the terms dict here // should not be costly, because 1) the // query/filter will load the TermInfo when it @@ -313,21 +312,26 @@ if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { // Too many terms -- make a filter. 
- Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); + Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); result.setBoost(query.getBoost()); return result; } else if (!enumerator.next()) { // Enumeration is done, and we hit a small // enough number of terms & docs -- just make a // BooleanQuery, now - BooleanQuery bq = new BooleanQuery(true); - for (final Term term: oldApiPendingTerms) { - TermQuery tq = new TermQuery(term); - bq.add(tq, BooleanClause.Occur.SHOULD); + final Query result; + if (pendingTerms.isEmpty()) { + result = new BooleanQuery(true); + } else { + BooleanQuery bq = new BooleanQuery(true); + for(Term term : pendingTerms) { + TermQuery tq = new TermQuery(term); + bq.add(tq, BooleanClause.Occur.SHOULD); + } + // Strip scores + result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); + result.setBoost(query.getBoost()); } - // Strip scores - Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); - result.setBoost(query.getBoost()); query.incTotalNumberOfTerms(pendingTerms.size()); return result; } Index: src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java =================================================================== --- src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (revision 887534) +++ src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (working copy) @@ -106,14 +106,14 @@ public DocIdSet getDocIdSet(IndexReader reader) throws IOException { final FilteredTermsEnum termsEnum = query.getTermsEnum(reader); if (termsEnum != null) { - if (!termsEnum.empty()) { + if (termsEnum.next() != null) { // fill into a OpenBitSet final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc()); final int[] docs = new int[32]; final int[] freqs = new int[32]; int termCount = 0; final Bits delDocs = reader.getDeletedDocs(); - while (true) { + do { termCount++; // System.out.println(" iter termCount=" + termCount + " term=" + // enumerator.term().toBytesString()); @@ -128,13 +128,7 @@ break; } } - TermRef term = termsEnum.next(); - if (term == null) { - break; - } - // System.out.println(" enum next term=" + term.toBytesString()); - assert term.termEquals(termsEnum.term()); - } + } while (termsEnum.next() != null); // System.out.println(" done termCount=" + termCount); query.incTotalNumberOfTerms(termCount); Index: src/java/org/apache/lucene/search/NumericRangeQuery.java =================================================================== --- src/java/org/apache/lucene/search/NumericRangeQuery.java (revision 887534) +++ src/java/org/apache/lucene/search/NumericRangeQuery.java (working copy) @@ -28,8 +28,6 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermRef; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; /** *

A {@link Query} that matches numeric values within a @@ -163,7 +161,7 @@ assert (valSize == 32 || valSize == 64); if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); - this.field = StringHelper.intern(field); + this.field = field; this.precisionStep = precisionStep; this.valSize = valSize; this.min = min; @@ -303,9 +301,12 @@ return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 32, min, max, minInclusive, maxInclusive); } - @Override + @Override @SuppressWarnings("unchecked") protected FilteredTermsEnum getTermsEnum(final IndexReader reader) throws IOException { - return new NumericRangeTermsEnum(reader); + // very strange: java.lang.Number itsself is not Comparable, but all subclasses used here are + return (min != null && max != null && ((Comparable) min).compareTo(max) > 0) ? + new EmptyTermsEnum(field) : + new NumericRangeTermsEnum(reader); } /** Returns the field name for this query */ @@ -344,7 +345,7 @@ if (o instanceof NumericRangeQuery) { final NumericRangeQuery q=(NumericRangeQuery)o; return ( - field==q.field && + field.equals(q.field) && (q.min == null ? min == null : q.min.equals(min)) && (q.max == null ? max == null : q.max.equals(max)) && minInclusive == q.minInclusive && @@ -365,15 +366,9 @@ (Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+ (Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe); } - - // field must be interned after reading from stream - private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException { - in.defaultReadObject(); - field = StringHelper.intern(field); - } // members (package private, to be also fast accessible by NumericRangeTermEnum) - String field; + final String field; final int precisionStep, valSize; final T min, max; final boolean minInclusive,maxInclusive; @@ -390,15 +385,13 @@ */ private final class NumericRangeTermsEnum extends FilteredTermsEnum { - private final IndexReader reader; - private final LinkedList rangeBounds = new LinkedList(); private TermRef currentUpperBound = null; - private final boolean empty; + + private final LinkedList rangeBounds = new LinkedList(); private final TermRef.Comparator termComp; NumericRangeTermsEnum(final IndexReader reader) throws IOException { - this.reader = reader; - + super(reader, field); switch (valSize) { case 64: { // lower @@ -475,21 +468,7 @@ throw new IllegalArgumentException("valSize must be 32 or 64"); } - // initialize iterator - final Terms terms = reader.fields().terms(field); - if (terms != null) { - // TODO: NRQ by design relies on a specific sort - // order; I think UT8 or UTF16 would work (NRQ encodes - // to only ASCII). 
- termComp = terms.getTermComparator(); - actualEnum = terms.iterator(); - } else { - termComp = null; - actualEnum = null; - } - - // seek to first term - empty = next() == null; + termComp = getTermComparator(); } @Override @@ -498,53 +477,10 @@ } @Override - public boolean empty() { - return empty; - } - - @Override - protected TermRef setEnum(TermsEnum actualEnum, TermRef term) throws IOException { - throw new UnsupportedOperationException("not implemented"); - } - - @Override - public SeekStatus seek(TermRef term) throws IOException { - throw new UnsupportedOperationException("not implemented"); - } - - @Override - public SeekStatus seek(long ord) throws IOException { - throw new UnsupportedOperationException("not implemented"); - } - - @Override - public String field() { - return field; - } - - @Override - protected AcceptStatus accept(TermRef term) { - return (termComp.compare(term, currentUpperBound) <= 0) ? - AcceptStatus.YES : AcceptStatus.NO; - } - - @Override - public TermRef next() throws IOException { - if (actualEnum == null) { + protected final TermRef nextSeekTerm(final boolean enumExhausted) throws IOException { + if (enumExhausted) return null; - } - - // try change to next term, if no such term exists, fall-through - // (we can only do this if the enum was already seeked) - if (currentUpperBound != null) { - final TermRef term = actualEnum.next(); - if (term != null && accept(term) == AcceptStatus.YES) { - return term; - } - } - - // if all above fails, we seek forward - while (rangeBounds.size() >= 2) { + if (rangeBounds.size() >= 2) { assert rangeBounds.size() % 2 == 0; final TermRef lowerBound = new TermRef(rangeBounds.removeFirst()); @@ -552,22 +488,19 @@ "The current upper bound must be <= the new lower bound"; this.currentUpperBound = new TermRef(rangeBounds.removeFirst()); - - SeekStatus status = actualEnum.seek(lowerBound); - if (status == SeekStatus.END) { - return null; - } - - final TermRef term = actualEnum.term(); - if (accept(term) == AcceptStatus.YES) { - return term; - } + return lowerBound; } // no more sub-range enums available assert rangeBounds.size() == 0; return null; } + + @Override + protected AcceptStatus accept(TermRef term) { + return (currentUpperBound != null && termComp.compare(term, currentUpperBound) <= 0) ? + AcceptStatus.YES : AcceptStatus.NO_AND_SEEK; + } } Index: src/java/org/apache/lucene/search/PrefixTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/PrefixTermsEnum.java (revision 887534) +++ src/java/org/apache/lucene/search/PrefixTermsEnum.java (working copy) @@ -21,50 +21,31 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermRef; /** * Subclass of FilteredTermEnum for enumerating all terms that match the * specified prefix filter term. - *
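For orientation, a sketch of what the NumericRangeQuery changes above mean for a caller: an inverted range is now short-circuited to an EmptyTermsEnum, while a normal range is walked sub-range by sub-range via nextSeekTerm()/accept(). The newIntRange factory is the standard NumericRangeQuery API rather than part of this hunk, and the "price" field and bounds are illustrative only.

  import org.apache.lucene.search.NumericRangeQuery;
  import org.apache.lucene.search.Query;

  public class NumericRangeExample {
    public static void main(String[] args) {
      // Normal case: the enum seeks to each precision-step sub-range lower bound
      // via nextSeekTerm() and accepts terms up to currentUpperBound.
      Query priced = NumericRangeQuery.newIntRange("price", 4, 10, 100, true, true);

      // Degenerate case (min > max): with this patch getTermsEnum() returns an
      // EmptyTermsEnum instead of building any sub-range bounds.
      Query empty = NumericRangeQuery.newIntRange("price", 4, 100, 10, true, true);

      System.out.println(priced + " / " + empty);
    }
  }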

- * Term enumerations are always ordered by Term.compareTo(). Each term in - * the enumeration is greater than all that precede it. - * + *

Term enumerations are always ordered by + * {@link #getTermComparator}. Each term in the enumeration is + * greater than all that precede it.
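The pattern this patch applies to each enum is the same: pass the reader and field to the FilteredTermsEnum constructor, declare a starting point with setInitialSeekTerm(), and filter candidates in accept(). A minimal sketch of that pattern under the signatures visible in this patch; the SuffixTermsEnum class, its field, and the suffix test are purely illustrative and not part of the change.

  package org.apache.lucene.search;

  import java.io.IOException;

  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.TermRef;

  /** Illustrative only: enumerates terms of one field that end with a given suffix. */
  final class SuffixTermsEnum extends FilteredTermsEnum {
    private final String suffix;

    SuffixTermsEnum(IndexReader reader, String field, String suffix) throws IOException {
      super(reader, field);                 // base class locates the field's terms
      this.suffix = suffix;
      setInitialSeekTerm(new TermRef(""));  // no usable prefix, so start at the first term
    }

    @Override
    protected AcceptStatus accept(TermRef term) {
      // Keep scanning the whole field; only matching terms are returned to callers.
      return term.toString().endsWith(suffix) ? AcceptStatus.YES : AcceptStatus.NO;
    }

    @Override
    public float difference() {
      return 1.0f;                          // flat boost contribution, as in PrefixTermsEnum
    }
  }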

*/ public class PrefixTermsEnum extends FilteredTermsEnum { private final Term prefix; private final TermRef prefixRef; - private final boolean empty; public PrefixTermsEnum(IndexReader reader, Term prefix) throws IOException { + super(reader, prefix.field()); this.prefix = prefix; - Terms terms = reader.fields().terms(prefix.field()); - if (terms != null) { - prefixRef = new TermRef(prefix.text()); - empty = setEnum(terms.iterator(), prefixRef) == null; - } else { - empty = true; - prefixRef = null; - } + setInitialSeekTerm(prefixRef = new TermRef(prefix.text())); } @Override - public String field() { - return prefix.field(); - } - - @Override public float difference() { return 1.0f; } - @Override - public boolean empty() { - return empty; - } - protected Term getPrefixTerm() { return prefix; } Index: src/java/org/apache/lucene/search/RegexpQuery.java =================================================================== --- src/java/org/apache/lucene/search/RegexpQuery.java (revision 0) +++ src/java/org/apache/lucene/search/RegexpQuery.java (revision 0) @@ -0,0 +1,105 @@ +package org.apache.lucene.search; + +import java.io.IOException; + +import org.apache.lucene.index.Term; + +import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonProvider; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A fast regular expression query based on the + * {@link org.apache.lucene.util.automaton} package. + *
    + *
  • Comparisons are fast + *
  • The term dictionary is enumerated in an intelligent way, to avoid + * comparisons. See {@link AutomatonQuery} for more details. + *
+ *

+ * The supported syntax is documented in the {@link RegExp} class. + * Note this might be different than other regular expression implementations. + * For some alternatives with different syntax, look under contrib/regex + *

+ *

+ * Note this query can be slow, as it needs to iterate over many terms. In order + * to prevent extremely slow RegexpQueries, a Regexp term should not start with + * the expression .* + * + * @see RegExp + */ +public class RegexpQuery extends AutomatonQuery { + /** + * A provider that provides no named automata + */ + private static AutomatonProvider defaultProvider = new AutomatonProvider() { + public Automaton getAutomaton(String name) throws IOException { + return null; + } + }; + + /** + * Constructs a query for terms matching term. + *

+ * By default, all regular expression features are enabled. + *
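A usage sketch for the class this hunk introduces; the index handling and the "body" field/pattern are illustrative and not part of the patch.

  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.search.Query;
  import org.apache.lucene.search.RegexpQuery;
  import org.apache.lucene.search.TopDocs;

  public class RegexpQueryExample {
    public static TopDocs search(IndexReader reader) throws Exception {
      // Matches terms such as "lucene" in the "body" field; this constructor
      // enables all RegExp features (RegExp.ALL).
      Query q = new RegexpQuery(new Term("body", "lu[a-z]*ne"));
      IndexSearcher searcher = new IndexSearcher(reader);
      return searcher.search(q, 10);
    }
  }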

+ * + * @param term regular expression. + */ + public RegexpQuery(Term term) { + this(term, RegExp.ALL); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param flags optional RegExp features from {@link RegExp} + */ + public RegexpQuery(Term term, int flags) { + this(term, flags, defaultProvider); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param flags optional RegExp features from {@link RegExp} + * @param provider custom AutomatonProvider for named automata + */ + public RegexpQuery(Term term, int flags, AutomatonProvider provider) { + super(term, new RegExp(term.text(), flags).toAutomaton(provider)); + } + + /** Prints a user-readable version of this query. */ + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + if (!term.field().equals(field)) { + buffer.append(term.field()); + buffer.append(":"); + } + buffer.append(term.text()); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } +} Property changes on: src\java\org\apache\lucene\search\RegexpQuery.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/SingleTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/SingleTermsEnum.java (revision 887534) +++ src/java/org/apache/lucene/search/SingleTermsEnum.java (working copy) @@ -31,10 +31,8 @@ * but want to preserve MultiTermQuery semantics such as * {@link MultiTermQuery#rewriteMethod}. */ -public class SingleTermsEnum extends FilteredTermsEnum { - private final Term singleTerm; +public final class SingleTermsEnum extends FilteredTermsEnum { private final TermRef singleRef; - private final boolean empty; /** * Creates a new SingleTermsEnum. @@ -43,38 +41,18 @@ * if it exists. */ public SingleTermsEnum(IndexReader reader, Term singleTerm) throws IOException { - this.singleTerm = singleTerm; - Terms terms = reader.fields().terms(singleTerm.field()); - if (terms != null) { - singleRef = new TermRef(singleTerm.text()); - empty = setEnum(terms.iterator(), singleRef) == null; - } else { - empty = true; - singleRef = null; - } + super(reader, singleTerm.field()); + singleRef = new TermRef(singleTerm.text()); + setInitialSeekTerm(singleRef); } @Override protected AcceptStatus accept(TermRef term) { - if (term.equals(singleRef)) { - return AcceptStatus.YES; - } else { - return AcceptStatus.END; - } + return term.equals(singleRef) ? AcceptStatus.YES : AcceptStatus.END; } @Override public float difference() { return 1.0F; } - - @Override - public boolean empty() { - return empty; - } - - @Override - public String field() { - return singleTerm.field(); - } } Index: src/java/org/apache/lucene/search/TermRangeQuery.java =================================================================== --- src/java/org/apache/lucene/search/TermRangeQuery.java (revision 887534) +++ src/java/org/apache/lucene/search/TermRangeQuery.java (working copy) @@ -142,10 +142,10 @@ @Override protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException { - return new TermRangeTermsEnum(reader, field, - lowerTerm, upperTerm, - includeLower, includeUpper, - collator); + return (collator == null && lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) ? 
+ new EmptyTermsEnum(field) : + new TermRangeTermsEnum(reader, field, + lowerTerm, upperTerm, includeLower, includeUpper, collator); } /** Prints a user-readable version of this query. */ Index: src/java/org/apache/lucene/search/TermRangeTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/TermRangeTermsEnum.java (revision 887534) +++ src/java/org/apache/lucene/search/TermRangeTermsEnum.java (working copy) @@ -22,28 +22,24 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermRef; -import org.apache.lucene.index.Terms; -//import org.apache.lucene.index.Term; import org.apache.lucene.util.StringHelper; /** * Subclass of FilteredTermEnum for enumerating all terms that match the * specified range parameters. - *

- * Term enumerations are always ordered by Term.compareTo(). Each term in - * the enumeration is greater than all that precede it. + *

Term enumerations are always ordered by + * {@link #getTermComparator}. Each term in the enumeration is + * greater than all that precede it.
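The TermRangeQuery hunk above short-circuits inverted ranges to an EmptyTermsEnum before this enum is ever constructed. A small illustration; the "title" field and bounds are made up, and the five-argument constructor is the existing TermRangeQuery API.

  import org.apache.lucene.search.TermRangeQuery;

  public class TermRangeExample {
    public static void main(String[] args) {
      // Normal range: enumerates "a" <= term <= "m" on the "title" field.
      TermRangeQuery ok = new TermRangeQuery("title", "a", "m", true, true);

      // Inverted range (lower > upper, no collator): with this patch the query is
      // handed an EmptyTermsEnum, so it rewrites to a query matching nothing
      // instead of scanning the term dictionary.
      TermRangeQuery none = new TermRangeQuery("title", "m", "a", true, true);

      System.out.println(ok + " / " + none);
    }
  }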

*/ public class TermRangeTermsEnum extends FilteredTermsEnum { private Collator collator; - private String field; private String upperTermText; private String lowerTermText; private boolean includeLower; private boolean includeUpper; final private TermRef lowerTermRef; final private TermRef upperTermRef; - private final boolean empty; private final TermRef.Comparator termComp; /** @@ -75,12 +71,12 @@ */ public TermRangeTermsEnum(IndexReader reader, String field, String lowerTermText, String upperTermText, boolean includeLower, boolean includeUpper, Collator collator) throws IOException { + super(reader, field); this.collator = collator; this.upperTermText = upperTermText; this.lowerTermText = lowerTermText; this.includeLower = includeLower; this.includeUpper = includeUpper; - this.field = StringHelper.intern(field); // do a little bit of normalization... // open ended range queries should always be inclusive. @@ -97,22 +93,9 @@ upperTermRef = new TermRef(upperTermText); } - String startTermText = collator == null ? this.lowerTermText : ""; - Terms terms = reader.fields().terms(field); - - if (terms != null) { - termComp = terms.getTermComparator(); - final boolean foundFirstTerm = setEnum(terms.iterator(), new TermRef(startTermText)) != null; - - if (foundFirstTerm && collator == null && !this.includeLower && term().termEquals(lowerTermRef)) { - empty = next() == null; - } else { - empty = !foundFirstTerm; - } - } else { - empty = true; - termComp = null; - } + TermRef startTermRef = (collator == null) ? lowerTermRef : new TermRef(""); + setInitialSeekTerm(startTermRef); + termComp = getTermComparator(); } @Override @@ -121,18 +104,10 @@ } @Override - public boolean empty() { - return empty; - } - - @Override - public String field() { - return field; - } - - @Override protected AcceptStatus accept(TermRef term) { if (collator == null) { + if (!this.includeLower && term.equals(lowerTermRef)) + return AcceptStatus.NO; // Use this field's default sort ordering if (upperTermRef != null) { final int cmp = termComp.compare(upperTermRef, term); Index: src/java/org/apache/lucene/search/WildcardQuery.java =================================================================== --- src/java/org/apache/lucene/search/WildcardQuery.java (revision 887534) +++ src/java/org/apache/lucene/search/WildcardQuery.java (working copy) @@ -17,76 +17,67 @@ * limitations under the License. */ -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; -import java.io.IOException; +import java.util.List; +import java.util.ArrayList; /** Implements the wildcard search query. Supported wildcards are *, which * matches any character sequence (including the empty one), and ?, * which matches any single character. Note this query can be slow, as it * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries, - * a Wildcard term should not start with one of the wildcards * or - * ?. + * a Wildcard term should not start with the wildcard *. * *

This query uses the {@link * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * rewrite method. * * @see WildcardTermEnums */ -public class WildcardQuery extends MultiTermQuery { - private boolean termContainsWildcard; - private boolean termIsPrefix; - protected Term term; - - public WildcardQuery(Term term) { - this.term = term; - String text = term.text(); - this.termContainsWildcard = (text.indexOf('*') != -1) - || (text.indexOf('?') != -1); - this.termIsPrefix = termContainsWildcard - && (text.indexOf('?') == -1) - && (text.indexOf('*') == text.length() - 1); - } - - @Override - protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException { - if (termContainsWildcard) - return new WildcardTermsEnum(reader, getTerm()); - else - return new SingleTermsEnum(reader, getTerm()); - } - - // @deprecated see getTermsEnum - @Override - protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { - if (termContainsWildcard) - return new WildcardTermEnum(reader, getTerm()); - else - return new SingleTermEnum(reader, getTerm()); - } - - /** - * Returns the pattern term. - */ - public Term getTerm() { - return term; - } - @Override - public Query rewrite(IndexReader reader) throws IOException { - if (termIsPrefix) { - MultiTermQuery rewritten = new PrefixQuery(term.createTerm(term.text() - .substring(0, term.text().indexOf('*')))); - rewritten.setBoost(getBoost()); - rewritten.setRewriteMethod(getRewriteMethod()); - return rewritten; - } else { - return super.rewrite(reader); - } - } - + public class WildcardQuery extends AutomatonQuery { + + /** String equality with support for wildcards */ + public static final char WILDCARD_STRING = '*'; + + /** Char equality with support for wildcards */ + public static final char WILDCARD_CHAR = '?'; + + /** + * Constructs a query for terms matching term. + */ + public WildcardQuery(Term term) { + super(term, toAutomaton(term)); + } + + /** + * Convert Lucene wildcard syntax into an automaton. + */ + static Automaton toAutomaton(Term wildcardquery) { + List automata = new ArrayList(); + + String wildcardText = wildcardquery.text(); + + for (int i = 0; i < wildcardText.length(); i++) { + final char c = wildcardText.charAt(i); + switch(c) { + case WILDCARD_STRING: + automata.add(BasicAutomata.makeAnyString()); + break; + case WILDCARD_CHAR: + automata.add(BasicAutomata.makeAnyChar()); + break; + default: + automata.add(BasicAutomata.makeChar(c)); + } + } + + return BasicOperations.concatenate(automata); + } + /** Prints a user-readable version of this query. */ @Override public String toString(String field) { @@ -99,30 +90,4 @@ buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } - - @Override - public int hashCode() { - final int prime = 31; - int result = super.hashCode(); - result = prime * result + ((term == null) ? 
0 : term.hashCode()); - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (!super.equals(obj)) - return false; - if (getClass() != obj.getClass()) - return false; - WildcardQuery other = (WildcardQuery) obj; - if (term == null) { - if (other.term != null) - return false; - } else if (!term.equals(other.term)) - return false; - return true; - } - } Index: src/java/org/apache/lucene/search/WildcardTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/WildcardTermEnum.java (revision 887534) +++ src/java/org/apache/lucene/search/WildcardTermEnum.java (working copy) @@ -30,13 +30,7 @@ * the enumeration is greater than all that precede it. * @deprecated Please use {@link WildcardTermsEnum} instead. */ -public class WildcardTermEnum extends FilteredTermEnum { - final Term searchTerm; - final String field; - final String text; - final String pre; - final int preLen; - boolean endEnum = false; +public class WildcardTermEnum extends AutomatonTermEnum { /** * Creates a new WildcardTermEnum. @@ -45,60 +39,21 @@ * valid term if such a term exists. */ public WildcardTermEnum(IndexReader reader, Term term) throws IOException { - super(); - searchTerm = term; - field = searchTerm.field(); - final String searchTermText = searchTerm.text(); - - final int sidx = searchTermText.indexOf(WILDCARD_STRING); - final int cidx = searchTermText.indexOf(WILDCARD_CHAR); - int idx = sidx; - if (idx == -1) { - idx = cidx; - } - else if (cidx >= 0) { - idx = Math.min(idx, cidx); - } - pre = idx != -1?searchTerm.text().substring(0,idx): ""; - - preLen = pre.length(); - text = searchTermText.substring(preLen); - setEnum(reader.terms(new Term(searchTerm.field(), pre))); + super(WildcardQuery.toAutomaton(term), term, reader); } - @Override - protected final boolean termCompare(Term term) { - if (field == term.field()) { - String searchText = term.text(); - if (searchText.startsWith(pre)) { - return wildcardEquals(text, 0, searchText, preLen); - } - } - endEnum = true; - return false; - } + /** String equality with support for wildcards */ + public static final char WILDCARD_STRING = WildcardQuery.WILDCARD_STRING; - @Override - public float difference() { - return 1.0f; - } + /** Char equality with support for wildcards */ + public static final char WILDCARD_CHAR = WildcardQuery.WILDCARD_CHAR; - @Override - public final boolean endEnum() { - return endEnum; - } - - /******************************************** - * String equality with support for wildcards - ********************************************/ - - public static final char WILDCARD_STRING = '*'; - public static final char WILDCARD_CHAR = '?'; - /** * Determines if a word matches a wildcard pattern. * Work released by Granta Design Ltd after originally being done on * company time. + *
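The new WildcardQuery.toAutomaton() above builds the pattern by concatenating one small automaton per character, which is why the class can now simply extend AutomatonQuery. The same composition written out by hand, using only the factories added in this patch; the pattern "te?t*" and the membership check are illustrative.

  import java.util.ArrayList;
  import java.util.List;

  import org.apache.lucene.util.automaton.Automaton;
  import org.apache.lucene.util.automaton.BasicAutomata;
  import org.apache.lucene.util.automaton.BasicOperations;

  public class WildcardAutomatonExample {
    public static void main(String[] args) {
      // "te?t*"  ->  't' 'e' <any single char> 't' <any string>
      List<Automaton> parts = new ArrayList<Automaton>();
      parts.add(BasicAutomata.makeChar('t'));
      parts.add(BasicAutomata.makeChar('e'));
      parts.add(BasicAutomata.makeAnyChar());   // ? -> exactly one character
      parts.add(BasicAutomata.makeChar('t'));
      parts.add(BasicAutomata.makeAnyString()); // * -> any character sequence
      Automaton wildcard = BasicOperations.concatenate(parts);

      // Accepts "test", "text", "testing", ...; membership can be checked by asking
      // whether the singleton language {"texts"} is a subset of the wildcard language.
      boolean accepted = BasicAutomata.makeString("texts").subsetOf(wildcard);
      System.out.println(accepted);  // true
    }
  }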

Note: This method is no longer used by this class! + * It is dead code and only available for backwards compatibility. */ public static final boolean wildcardEquals(String pattern, int patternIdx, String string, int stringIdx) Index: src/java/org/apache/lucene/search/WildcardTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/WildcardTermsEnum.java (revision 887534) +++ src/java/org/apache/lucene/search/WildcardTermsEnum.java (working copy) @@ -35,11 +35,9 @@ */ public class WildcardTermsEnum extends FilteredTermsEnum { final Term searchTerm; - final String field; final String text; final String pre; final int preLen; - private final boolean empty; private final TermRef preTermRef; /** @@ -49,9 +47,8 @@ * valid term if such a term exists. */ public WildcardTermsEnum(IndexReader reader, Term term) throws IOException { - super(); - searchTerm = term; - field = searchTerm.field(); + super(reader, term.field()); + this.searchTerm = term; final String searchTermText = searchTerm.text(); final int sidx = searchTermText.indexOf(WILDCARD_STRING); @@ -67,22 +64,10 @@ preLen = pre.length(); text = searchTermText.substring(preLen); - preTermRef = new TermRef(pre); - - Terms terms = reader.fields().terms(searchTerm.field()); - if (terms != null) { - empty = setEnum(terms.iterator(), preTermRef) == null; - } else { - empty = true; - } + setInitialSeekTerm(preTermRef = new TermRef(pre)); } @Override - public String field() { - return searchTerm.field(); - } - - @Override protected final AcceptStatus accept(TermRef term) { if (term.startsWith(preTermRef)) { // TODO: would be better, but trickier, to not have to @@ -104,11 +89,6 @@ return 1.0f; } - @Override - public final boolean empty() { - return empty; - } - /******************************************** * String equality with support for wildcards ********************************************/ Index: src/java/org/apache/lucene/util/automaton/Automaton.java =================================================================== --- src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0) @@ -0,0 +1,748 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +/** + * Finite-state automaton with regular expression operations. + *
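Since the class ports the Brics API largely intact, the convenience instance methods declared further down (union, intersection, subsetOf, repeat, minimize and friends) can be chained directly. A small sketch; the character range and strings are chosen arbitrarily.

  import org.apache.lucene.util.automaton.Automaton;
  import org.apache.lucene.util.automaton.BasicAutomata;

  public class AutomatonOpsExample {
    public static void main(String[] args) {
      Automaton lower = BasicAutomata.makeCharRange('a', 'z').repeat();  // [a-z]*
      Automaton exact = BasicAutomata.makeString("lucene");

      // "lucene" consists only of lowercase letters, so its language is a subset.
      System.out.println(exact.subsetOf(lower));                         // true

      // Combine and minimize; the result accepts any lowercase string plus "Lucene".
      Automaton combined = Automaton.minimize(
          lower.union(BasicAutomata.makeString("Lucene")));
      System.out.println(combined.getNumberOfStates());
    }
  }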

+ * Class invariants: + *

    + *
  • An automaton is either represented explicitly (with {@link State} and + * {@link Transition} objects) or with a singleton string (see + * {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton + * is known to accept exactly one string. (Implicitly, all states and + * transitions of an automaton are reachable from its initial state.) + *
  • Automata are always reduced (see {@link #reduce()}) and have no + * transitions to dead states (see {@link #removeDeadTransitions()}). + *
  • If an automaton is nondeterministic, then {@link #isDeterministic()} + * returns false (but the converse is not required). + *
  • Automata provided as input to operations are generally assumed to be + * disjoint. + *
+ *

+ * If the states or transitions are manipulated manually, the + * {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods + * should be used afterwards to restore representation invariants that are + * assumed by the built-in automata operations. + * + *

+ * WARNING: The status of the Automaton feature is + * experimental. The APIs introduced here might change in the future and will + * not be supported anymore in such a case. + */ +public class Automaton implements Serializable, Cloneable { + + static final long serialVersionUID = 10001; + + /** + * Minimize using Hopcroft's O(n log n) algorithm. This is regarded as one of + * the most generally efficient algorithms that exist. + * + * @see #setMinimization(int) + */ + public static final int MINIMIZE_HOPCROFT = 2; + + /** Selects minimization algorithm (default: MINIMIZE_HOPCROFT). */ + static int minimization = MINIMIZE_HOPCROFT; + + /** Initial state of this automaton. */ + State initial; + + /** + * If true, then this automaton is definitely deterministic (i.e., there are + * no choices for any run, but a run may crash). + */ + boolean deterministic; + + /** Extra data associated with this automaton. */ + transient Object info; + + /** + * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)} + */ + int hash_code; + + /** Singleton string. Null if not applicable. */ + String singleton; + + /** Minimize always flag. */ + static boolean minimize_always = false; + + /** + * Selects whether operations may modify the input automata (default: + * false). + */ + static boolean allow_mutation = false; + + /** + * Constructs a new automaton that accepts the empty language. Using this + * constructor, automata can be constructed manually from {@link State} and + * {@link Transition} objects. + * + * @see #setInitialState(State) + * @see State + * @see Transition + */ + public Automaton() { + initial = new State(); + deterministic = true; + singleton = null; + } + + boolean isDebug() { + return System.getProperty("dk.brics.automaton.debug") != null; + } + + /** + * Selects minimization algorithm (default: MINIMIZE_HOPCROFT). + * + * @param algorithm minimization algorithm + */ + static public void setMinimization(int algorithm) { + minimization = algorithm; + } + + /** + * Sets or resets minimize always flag. If this flag is set, then + * {@link MinimizationOperations#minimize(Automaton)} will automatically be + * invoked after all operations that otherwise may produce non-minimal + * automata. By default, the flag is not set. + * + * @param flag if true, the flag is set + */ + static public void setMinimizeAlways(boolean flag) { + minimize_always = flag; + } + + /** + * Sets or resets allow mutate flag. If this flag is set, then all automata + * operations may modify automata given as input; otherwise, operations will + * always leave input automata languages unmodified. By default, the flag is + * not set. + * + * @param flag if true, the flag is set + * @return previous value of the flag + */ + static public boolean setAllowMutate(boolean flag) { + boolean b = allow_mutation; + allow_mutation = flag; + return b; + } + + /** + * Returns the state of the allow mutate flag. If this flag is set, then all + * automata operations may modify automata given as input; otherwise, + * operations will always leave input automata languages unmodified. By + * default, the flag is not set. + * + * @return current value of the flag + */ + static boolean getAllowMutate() { + return allow_mutation; + } + + void checkMinimizeAlways() { + if (minimize_always) MinimizationOperations.minimize(this); + } + + boolean isSingleton() { + return singleton != null; + } + + /** + * Returns the singleton string for this automaton. 
An automaton that accepts + * exactly one string may be represented in singleton mode. In that + * case, this method may be used to obtain the string. + * + * @return string, null if this automaton is not in singleton mode. + */ + public String getSingleton() { + return singleton; + } + + /** + * Sets initial state. + * + * @param s state + */ + public void setInitialState(State s) { + initial = s; + singleton = null; + } + + /** + * Gets initial state. + * + * @return state + */ + public State getInitialState() { + expandSingleton(); + return initial; + } + + /** + * Returns deterministic flag for this automaton. + * + * @return true if the automaton is definitely deterministic, false if the + * automaton may be nondeterministic + */ + public boolean isDeterministic() { + return deterministic; + } + + /** + * Sets deterministic flag for this automaton. This method should (only) be + * used if automata are constructed manually. + * + * @param deterministic true if the automaton is definitely deterministic, + * false if the automaton may be nondeterministic + */ + public void setDeterministic(boolean deterministic) { + this.deterministic = deterministic; + } + + /** + * Associates extra information with this automaton. + * + * @param info extra information + */ + public void setInfo(Object info) { + this.info = info; + } + + /** + * Returns extra information associated with this automaton. + * + * @return extra information + * @see #setInfo(Object) + */ + public Object getInfo() { + return info; + } + + /** + * Returns the set of states that are reachable from the initial state. + * + * @return set of {@link State} objects + */ + public Set getStates() { + expandSingleton(); + Set visited; + if (isDebug()) visited = new LinkedHashSet(); + else visited = new HashSet(); + LinkedList worklist = new LinkedList(); + worklist.add(initial); + visited.add(initial); + while (worklist.size() > 0) { + State s = worklist.removeFirst(); + Collection tr; + if (isDebug()) tr = s.getSortedTransitions(false); + else tr = s.transitions; + for (Transition t : tr) + if (!visited.contains(t.to)) { + visited.add(t.to); + worklist.add(t.to); + } + } + return visited; + } + + /** + * Returns the set of reachable accept states. + * + * @return set of {@link State} objects + */ + public Set getAcceptStates() { + expandSingleton(); + HashSet accepts = new HashSet(); + HashSet visited = new HashSet(); + LinkedList worklist = new LinkedList(); + worklist.add(initial); + visited.add(initial); + while (worklist.size() > 0) { + State s = worklist.removeFirst(); + if (s.accept) accepts.add(s); + for (Transition t : s.transitions) + if (!visited.contains(t.to)) { + visited.add(t.to); + worklist.add(t.to); + } + } + return accepts; + } + + /** + * Assigns consecutive numbers to the given states. + */ + static void setStateNumbers(Set states) { + int number = 0; + for (State s : states) + s.number = number++; + } + + /** + * Adds transitions to explicit crash state to ensure that transition function + * is total. 
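The singleton mode described above can be observed directly through the public accessors; a short sketch with arbitrary values.

  import org.apache.lucene.util.automaton.Automaton;
  import org.apache.lucene.util.automaton.BasicAutomata;

  public class SingletonModeExample {
    public static void main(String[] args) {
      Automaton a = BasicAutomata.makeString("apache");
      System.out.println(a.getSingleton());       // "apache": kept as a plain string
      System.out.println(a.getNumberOfStates());  // 7: singleton length + 1, no states built yet

      a.expandSingleton();                        // switch to explicit State/Transition form
      System.out.println(a.getSingleton());       // null: no longer in singleton mode
    }
  }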
+ */ + void totalize() { + State s = new State(); + s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + s)); + for (State p : getStates()) { + int maxi = Character.MIN_VALUE; + for (Transition t : p.getSortedTransitions(false)) { + if (t.min > maxi) p.transitions.add(new Transition((char) maxi, + (char) (t.min - 1), s)); + if (t.max + 1 > maxi) maxi = t.max + 1; + } + if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition( + (char) maxi, Character.MAX_VALUE, s)); + } + } + + /** + * Restores representation invariant. This method must be invoked before any + * built-in automata operation is performed if automaton states or transitions + * are manipulated manually. + * + * @see #setDeterministic(boolean) + */ + public void restoreInvariant() { + removeDeadTransitions(); + } + + /** + * Reduces this automaton. An automaton is "reduced" by combining overlapping + * and adjacent edge intervals with same destination. + */ + public void reduce() { + if (isSingleton()) return; + Set states = getStates(); + setStateNumbers(states); + for (State s : states) { + List st = s.getSortedTransitions(true); + s.resetTransitions(); + State p = null; + int min = -1, max = -1; + for (Transition t : st) { + if (p == t.to) { + if (t.min <= max + 1) { + if (t.max > max) max = t.max; + } else { + if (p != null) s.transitions.add(new Transition((char) min, + (char) max, p)); + min = t.min; + max = t.max; + } + } else { + if (p != null) s.transitions.add(new Transition((char) min, + (char) max, p)); + p = t.to; + min = t.min; + max = t.max; + } + } + if (p != null) s.transitions + .add(new Transition((char) min, (char) max, p)); + } + } + + /** + * Returns sorted array of all interval start points. + */ + char[] getStartPoints() { + Set pointset = new HashSet(); + for (State s : getStates()) { + pointset.add(Character.MIN_VALUE); + for (Transition t : s.transitions) { + pointset.add(t.min); + if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1)); + } + } + char[] points = new char[pointset.size()]; + int n = 0; + for (Character m : pointset) + points[n++] = m; + Arrays.sort(points); + return points; + } + + /** + * Returns the set of live states. A state is "live" if an accept state is + * reachable from it. + * + * @return set of {@link State} objects + */ + public Set getLiveStates() { + expandSingleton(); + return getLiveStates(getStates()); + } + + private Set getLiveStates(Set states) { + HashMap> map = new HashMap>(); + for (State s : states) + map.put(s, new HashSet()); + for (State s : states) + for (Transition t : s.transitions) + map.get(t.to).add(s); + Set live = new HashSet(getAcceptStates()); + LinkedList worklist = new LinkedList(live); + while (worklist.size() > 0) { + State s = worklist.removeFirst(); + for (State p : map.get(s)) + if (!live.contains(p)) { + live.add(p); + worklist.add(p); + } + } + return live; + } + + /** + * Removes transitions to dead states and calls {@link #reduce()} and + * {@link #clearHashCode()}. (A state is "dead" if no accept state is + * reachable from it.) + */ + public void removeDeadTransitions() { + clearHashCode(); + if (isSingleton()) return; + Set states = getStates(); + Set live = getLiveStates(states); + for (State s : states) { + Set st = s.transitions; + s.resetTransitions(); + for (Transition t : st) + if (live.contains(t.to)) s.transitions.add(t); + } + reduce(); + } + + /** + * Returns a sorted array of transitions for each state (and sets state + * numbers). 
+ */ + static Transition[][] getSortedTransitions(Set states) { + setStateNumbers(states); + Transition[][] transitions = new Transition[states.size()][]; + for (State s : states) + transitions[s.number] = s.getSortedTransitionArray(false); + return transitions; + } + + /** + * Expands singleton representation to normal representation. Does nothing if + * not in singleton representation. + */ + public void expandSingleton() { + if (isSingleton()) { + State p = new State(); + initial = p; + for (int i = 0; i < singleton.length(); i++) { + State q = new State(); + p.transitions.add(new Transition(singleton.charAt(i), q)); + p = q; + } + p.accept = true; + deterministic = true; + singleton = null; + } + } + + /** + * Returns the number of states in this automaton. + */ + public int getNumberOfStates() { + if (isSingleton()) return singleton.length() + 1; + return getStates().size(); + } + + /** + * Returns the number of transitions in this automaton. This number is counted + * as the total number of edges, where one edge may be a character interval. + */ + public int getNumberOfTransitions() { + if (isSingleton()) return singleton.length(); + int c = 0; + for (State s : getStates()) + c += s.transitions.size(); + return c; + } + + /** + * Returns true if the language of this automaton is equal to the language of + * the given automaton. Implemented using hashCode and + * subsetOf. + */ + @Override + public boolean equals(Object obj) { + if (obj == this) return true; + if (!(obj instanceof Automaton)) return false; + Automaton a = (Automaton) obj; + if (isSingleton() && a.isSingleton()) return singleton.equals(a.singleton); + return hashCode() == a.hashCode() && BasicOperations.subsetOf(this, a) + && BasicOperations.subsetOf(a, this); + } + + /** + * Returns hash code for this automaton. The hash code is based on the number + * of states and transitions in the minimized automaton. Invoking this method + * may involve minimizing the automaton. + */ + @Override + public int hashCode() { + if (hash_code == 0) MinimizationOperations.minimize(this); + return hash_code; + } + + /** + * Must be invoked when the stored hash code may no longer be valid. + */ + void clearHashCode() { + hash_code = 0; + } + + /** + * Returns a string representation of this automaton. + */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + if (isSingleton()) { + b.append("singleton: "); + for (char c : singleton.toCharArray()) + Transition.appendCharString(c, b); + b.append("\n"); + } else { + Set states = getStates(); + setStateNumbers(states); + b.append("initial state: ").append(initial.number).append("\n"); + for (State s : states) + b.append(s.toString()); + } + return b.toString(); + } + + /** + * Returns Graphviz Dot representation of this automaton. + */ + public String toDot() { + StringBuilder b = new StringBuilder("digraph Automaton {\n"); + b.append(" rankdir = LR;\n"); + Set states = getStates(); + setStateNumbers(states); + for (State s : states) { + b.append(" ").append(s.number); + if (s.accept) b.append(" [shape=doublecircle,label=\"\"];\n"); + else b.append(" [shape=circle,label=\"\"];\n"); + if (s == initial) { + b.append(" initial [shape=plaintext,label=\"\"];\n"); + b.append(" initial -> ").append(s.number).append("\n"); + } + for (Transition t : s.transitions) { + b.append(" ").append(s.number); + t.appendDot(b); + } + } + return b.append("}\n").toString(); + } + + /** + * Returns a clone of this automaton, expands if singleton. 
+ */ + Automaton cloneExpanded() { + Automaton a = clone(); + a.expandSingleton(); + return a; + } + + /** + * Returns a clone of this automaton unless allow_mutation is + * set, expands if singleton. + */ + Automaton cloneExpandedIfRequired() { + if (allow_mutation) { + expandSingleton(); + return this; + } else return cloneExpanded(); + } + + /** + * Returns a clone of this automaton. + */ + @Override + public Automaton clone() { + try { + Automaton a = (Automaton) super.clone(); + if (!isSingleton()) { + HashMap m = new HashMap(); + Set states = getStates(); + for (State s : states) + m.put(s, new State()); + for (State s : states) { + State p = m.get(s); + p.accept = s.accept; + if (s == initial) a.initial = p; + for (Transition t : s.transitions) + p.transitions.add(new Transition(t.min, t.max, m.get(t.to))); + } + } + return a; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + /** + * Returns a clone of this automaton, or this automaton itself if + * allow_mutation flag is set. + */ + Automaton cloneIfRequired() { + if (allow_mutation) return this; + else return clone(); + } + + /** + * See {@link BasicOperations#concatenate(Automaton, Automaton)}. + */ + public Automaton concatenate(Automaton a) { + return BasicOperations.concatenate(this, a); + } + + /** + * See {@link BasicOperations#concatenate(List)}. + */ + static public Automaton concatenate(List l) { + return BasicOperations.concatenate(l); + } + + /** + * See {@link BasicOperations#optional(Automaton)}. + */ + public Automaton optional() { + return BasicOperations.optional(this); + } + + /** + * See {@link BasicOperations#repeat(Automaton)}. + */ + public Automaton repeat() { + return BasicOperations.repeat(this); + } + + /** + * See {@link BasicOperations#repeat(Automaton, int)}. + */ + public Automaton repeat(int min) { + return BasicOperations.repeat(this, min); + } + + /** + * See {@link BasicOperations#repeat(Automaton, int, int)}. + */ + public Automaton repeat(int min, int max) { + return BasicOperations.repeat(this, min, max); + } + + /** + * See {@link BasicOperations#complement(Automaton)}. + */ + public Automaton complement() { + return BasicOperations.complement(this); + } + + /** + * See {@link BasicOperations#minus(Automaton, Automaton)}. + */ + public Automaton minus(Automaton a) { + return BasicOperations.minus(this, a); + } + + /** + * See {@link BasicOperations#intersection(Automaton, Automaton)}. + */ + public Automaton intersection(Automaton a) { + return BasicOperations.intersection(this, a); + } + + /** + * See {@link BasicOperations#subsetOf(Automaton, Automaton)}. + */ + public boolean subsetOf(Automaton a) { + return BasicOperations.subsetOf(this, a); + } + + /** + * See {@link BasicOperations#union(Automaton, Automaton)}. + */ + public Automaton union(Automaton a) { + return BasicOperations.union(this, a); + } + + /** + * See {@link BasicOperations#union(Collection)}. + */ + static public Automaton union(Collection l) { + return BasicOperations.union(l); + } + + /** + * See {@link BasicOperations#determinize(Automaton)}. + */ + public void determinize() { + BasicOperations.determinize(this); + } + + /** + * See {@link BasicOperations#isEmptyString(Automaton)}. + */ + public boolean isEmptyString() { + return BasicOperations.isEmptyString(this); + } + + /** + * See {@link MinimizationOperations#minimize(Automaton)}. Returns the + * automaton being given as argument. 
+ */ + public static Automaton minimize(Automaton a) { + MinimizationOperations.minimize(a); + return a; + } +} Property changes on: src\java\org\apache\lucene\util\automaton\Automaton.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/AutomatonProvider.java =================================================================== --- src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0) @@ -0,0 +1,53 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.IOException; + +/** + * Automaton provider for RegExp. + * {@link RegExp#toAutomaton(AutomatonProvider)} + * + *
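AutomatonProvider resolves the named automata that a RegExp may reference; RegexpQuery's defaultProvider earlier in this patch resolves nothing. A map-backed provider is a minimal alternative sketch; the class name and map contents are illustrative.

  import java.io.IOException;
  import java.util.HashMap;
  import java.util.Map;

  import org.apache.lucene.util.automaton.Automaton;
  import org.apache.lucene.util.automaton.AutomatonProvider;
  import org.apache.lucene.util.automaton.BasicAutomata;

  /** Illustrative provider that looks named automata up in a map. */
  public class MapAutomatonProvider implements AutomatonProvider {
    private final Map<String, Automaton> automata = new HashMap<String, Automaton>();

    public MapAutomatonProvider() {
      // e.g. a reusable sub-automaton for arbitrary digit strings
      automata.put("digits", BasicAutomata.makeCharRange('0', '9').repeat());
    }

    public Automaton getAutomaton(String name) throws IOException {
      return automata.get(name);  // null if unknown, like the default provider
    }
  }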

+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public interface AutomatonProvider { + + /** + * Returns automaton of the given name. + * + * @param name automaton name + * @return automaton + * @throws IOException if errors occur + */ + public Automaton getAutomaton(String name) throws IOException; +} Property changes on: src\java\org\apache\lucene\util\automaton\AutomatonProvider.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/BasicAutomata.java =================================================================== --- src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0) @@ -0,0 +1,482 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +/** + * Construction of basic automata. + * + *

+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +final public class BasicAutomata { + // used by getWhitespaceAutomaton to match basic whitespace + private static final Automaton ws = Automaton.minimize(BasicAutomata + .makeCharSet(" \t\n\r").repeat()); + + private BasicAutomata() {} + + /** + * Returns a new (deterministic) automaton with the empty language. + */ + public static Automaton makeEmpty() { + Automaton a = new Automaton(); + State s = new State(); + a.initial = s; + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts only the empty string. + */ + public static Automaton makeEmptyString() { + Automaton a = new Automaton(); + a.singleton = ""; + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts all strings. + */ + public static Automaton makeAnyString() { + Automaton a = new Automaton(); + State s = new State(); + a.initial = s; + s.accept = true; + s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + s)); + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts any single character. + */ + public static Automaton makeAnyChar() { + return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE); + } + + /** + * Returns a new (deterministic) automaton that accepts a single character of + * the given value. + */ + public static Automaton makeChar(char c) { + Automaton a = new Automaton(); + a.singleton = Character.toString(c); + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts a single char whose + * value is in the given interval (including both end points). + */ + public static Automaton makeCharRange(char min, char max) { + if (min == max) return makeChar(min); + Automaton a = new Automaton(); + State s1 = new State(); + State s2 = new State(); + a.initial = s1; + s2.accept = true; + if (min <= max) s1.transitions.add(new Transition(min, max, s2)); + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts a single character in + * the given set. + */ + public static Automaton makeCharSet(String set) { + if (set.length() == 1) return makeChar(set.charAt(0)); + Automaton a = new Automaton(); + State s1 = new State(); + State s2 = new State(); + a.initial = s1; + s2.accept = true; + for (int i = 0; i < set.length(); i++) + s1.transitions.add(new Transition(set.charAt(i), s2)); + a.deterministic = true; + a.reduce(); + return a; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of length + * x.substring(n).length(). + */ + private static State anyOfRightLength(String x, int n) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else s.addTransition(new Transition('0', '9', anyOfRightLength(x, n + 1))); + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value at least + * x.substring(n) and length x.substring(n).length(). 
+ */ + private static State atLeast(String x, int n, Collection initials, + boolean zeros) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + if (zeros) initials.add(s); + char c = x.charAt(n); + s.addTransition(new Transition(c, atLeast(x, n + 1, initials, zeros + && c == '0'))); + if (c < '9') s.addTransition(new Transition((char) (c + 1), '9', + anyOfRightLength(x, n + 1))); + } + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value at most + * x.substring(n) and length x.substring(n).length(). + */ + private static State atMost(String x, int n) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + char c = x.charAt(n); + s.addTransition(new Transition(c, atMost(x, (char) n + 1))); + if (c > '0') s.addTransition(new Transition('0', (char) (c - 1), + anyOfRightLength(x, n + 1))); + } + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value between + * x.substring(n) and y.substring(n) and of length x.substring(n).length() + * (which must be equal to y.substring(n).length()). + */ + private static State between(String x, String y, int n, + Collection initials, boolean zeros) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + if (zeros) initials.add(s); + char cx = x.charAt(n); + char cy = y.charAt(n); + if (cx == cy) s.addTransition(new Transition(cx, between(x, y, n + 1, + initials, zeros && cx == '0'))); + else { // cx0, use fixed number of digits (strings must be prefixed + * by 0's to obtain the right length) - otherwise, the number of + * digits is not fixed + * @exception IllegalArgumentException if min>max or if numbers in the + * interval cannot be expressed with the given fixed number of + * digits + */ + public static Automaton makeInterval(int min, int max, int digits) + throws IllegalArgumentException { + Automaton a = new Automaton(); + String x = Integer.toString(min); + String y = Integer.toString(max); + if (min > max || (digits > 0 && y.length() > digits)) throw new IllegalArgumentException(); + int d; + if (digits > 0) d = digits; + else d = y.length(); + StringBuilder bx = new StringBuilder(); + for (int i = x.length(); i < d; i++) + bx.append('0'); + bx.append(x); + x = bx.toString(); + StringBuilder by = new StringBuilder(); + for (int i = y.length(); i < d; i++) + by.append('0'); + by.append(y); + y = by.toString(); + Collection initials = new ArrayList(); + a.initial = between(x, y, 0, initials, digits <= 0); + if (digits <= 0) { + ArrayList pairs = new ArrayList(); + for (State p : initials) + if (a.initial != p) pairs.add(new StatePair(a.initial, p)); + BasicOperations.addEpsilons(a, pairs); + a.initial.addTransition(new Transition('0', a.initial)); + a.deterministic = false; + } else a.deterministic = true; + a.checkMinimizeAlways(); + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts the single given + * string. + */ + public static Automaton makeString(String s) { + Automaton a = new Automaton(); + a.singleton = s; + a.deterministic = true; + return a; + } + + /** + * Constructs automaton that accept strings representing nonnegative integers + * that are not larger than the given value. 
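For orientation, makeInterval above can be exercised as follows; this is an illustrative sketch, not code from the patch.

    // Accept exactly the two-digit strings "01" .. "12".
    Automaton months = BasicAutomata.makeInterval(1, 12, 2);
    System.out.println(BasicOperations.run(months, "07")); // expected: true
    System.out.println(BasicOperations.run(months, "13")); // expected: false
    System.out.println(BasicOperations.run(months, "7"));  // expected: false, a fixed width of 2 digits is required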
+ * + * @param n string representation of maximum value + */ + public static Automaton makeMaxInteger(String n) { + int i = 0; + while (i < n.length() && n.charAt(i) == '0') + i++; + StringBuilder b = new StringBuilder(); + b.append("0*(0|"); + if (i < n.length()) b.append("[0-9]{1," + (n.length() - i - 1) + "}|"); + maxInteger(n.substring(i), 0, b); + b.append(")"); + return Automaton.minimize((new RegExp(b.toString())).toAutomaton()); + } + + private static void maxInteger(String n, int i, StringBuilder b) { + b.append('('); + if (i < n.length()) { + char c = n.charAt(i); + if (c != '0') b.append("[0-" + (char) (c - 1) + "][0-9]{" + + (n.length() - i - 1) + "}|"); + b.append(c); + maxInteger(n, i + 1, b); + } + b.append(')'); + } + + /** + * Constructs automaton that accept strings representing nonnegative integers + * that are not less that the given value. + * + * @param n string representation of minimum value + */ + public static Automaton makeMinInteger(String n) { + int i = 0; + while (i + 1 < n.length() && n.charAt(i) == '0') + i++; + StringBuilder b = new StringBuilder(); + b.append("0*"); + minInteger(n.substring(i), 0, b); + b.append("[0-9]*"); + return Automaton.minimize((new RegExp(b.toString())).toAutomaton()); + } + + private static void minInteger(String n, int i, StringBuilder b) { + b.append('('); + if (i < n.length()) { + char c = n.charAt(i); + if (c != '9') b.append("[" + (char) (c + 1) + "-9][0-9]{" + + (n.length() - i - 1) + "}|"); + b.append(c); + minInteger(n, i + 1, b); + } + b.append(')'); + } + + /** + * Constructs automaton that accept strings representing decimal numbers that + * can be written with at most the given number of digits. Surrounding + * whitespace is permitted. + * + * @param i max number of necessary digits + */ + public static Automaton makeTotalDigits(int i) { + return Automaton.minimize((new RegExp("[ \t\n\r]*[-+]?0*([0-9]{0," + i + + "}|((([0-9]\\.*){0," + i + "})&@\\.@)0*)[ \t\n\r]*")).toAutomaton()); + } + + /** + * Constructs automaton that accept strings representing decimal numbers that + * can be written with at most the given number of digits in the fraction + * part. Surrounding whitespace is permitted. + * + * @param i max number of necessary fraction digits + */ + public static Automaton makeFractionDigits(int i) { + return Automaton.minimize((new RegExp("[ \t\n\r]*[-+]?[0-9]+(\\.[0-9]{0," + + i + "}0*)?[ \t\n\r]*")).toAutomaton()); + } + + /** + * Constructs automaton that accept strings representing the given integer. + * Surrounding whitespace is permitted. + * + * @param value string representation of integer + */ + public static Automaton makeIntegerValue(String value) { + boolean minus = false; + int i = 0; + while (i < value.length()) { + char c = value.charAt(i); + if (c == '-') minus = true; + if (c >= '1' && c <= '9') break; + i++; + } + StringBuilder b = new StringBuilder(); + b.append(value.substring(i)); + if (b.length() == 0) b.append("0"); + Automaton s; + if (minus) s = makeChar('-'); + else s = makeChar('+').optional(); + Automaton ws = getWhitespaceAutomaton(); + return Automaton.minimize(ws.concatenate( + s.concatenate(makeChar('0').repeat()).concatenate( + makeString(b.toString()))).concatenate(ws)); + } + + /** + * Constructs automaton that accept strings representing the given decimal + * number. Surrounding whitespace is permitted. 
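A short sketch of makeMaxInteger in use (illustrative only):

    // Accept decimal strings whose numeric value is at most 255; leading zeros are allowed.
    Automaton octet = BasicAutomata.makeMaxInteger("255");
    System.out.println(BasicOperations.run(octet, "199"));  // expected: true
    System.out.println(BasicOperations.run(octet, "0042")); // expected: true
    System.out.println(BasicOperations.run(octet, "256"));  // expected: false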
+ * + * @param value string representation of decimal number + */ + public static Automaton makeDecimalValue(String value) { + boolean minus = false; + int i = 0; + while (i < value.length()) { + char c = value.charAt(i); + if (c == '-') minus = true; + if ((c >= '1' && c <= '9') || c == '.') break; + i++; + } + StringBuilder b1 = new StringBuilder(); + StringBuilder b2 = new StringBuilder(); + int p = value.indexOf('.', i); + if (p == -1) b1.append(value.substring(i)); + else { + b1.append(value.substring(i, p)); + i = value.length() - 1; + while (i > p) { + char c = value.charAt(i); + if (c >= '1' && c <= '9') break; + i--; + } + b2.append(value.substring(p + 1, i + 1)); + } + if (b1.length() == 0) b1.append("0"); + Automaton s; + if (minus) s = makeChar('-'); + else s = makeChar('+').optional(); + Automaton d; + if (b2.length() == 0) d = makeChar('.') + .concatenate(makeChar('0').repeat(1)).optional(); + else d = makeChar('.').concatenate(makeString(b2.toString())).concatenate( + makeChar('0').repeat()); + Automaton ws = getWhitespaceAutomaton(); + return Automaton.minimize(ws.concatenate( + s.concatenate(makeChar('0').repeat()).concatenate( + makeString(b1.toString())).concatenate(d)).concatenate(ws)); + } + + /** + * Constructs deterministic automaton that matches strings that contain the + * given substring. + */ + public static Automaton makeStringMatcher(String s) { + Automaton a = new Automaton(); + State[] states = new State[s.length() + 1]; + states[0] = a.initial; + for (int i = 0; i < s.length(); i++) + states[i + 1] = new State(); + State f = states[s.length()]; + f.accept = true; + f.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + f)); + for (int i = 0; i < s.length(); i++) { + Set done = new HashSet(); + char c = s.charAt(i); + states[i].transitions.add(new Transition(c, states[i + 1])); + done.add(c); + for (int j = i; j >= 1; j--) { + char d = s.charAt(j - 1); + if (!done.contains(d) + && s.substring(0, j - 1).equals(s.substring(i - j + 1, i))) { + states[i].transitions.add(new Transition(d, states[j])); + done.add(d); + } + } + char[] da = new char[done.size()]; + int h = 0; + for (char w : done) + da[h++] = w; + Arrays.sort(da); + int from = Character.MIN_VALUE; + int k = 0; + while (from <= Character.MAX_VALUE) { + while (k < da.length && da[k] == from) { + k++; + from++; + } + if (from <= Character.MAX_VALUE) { + int to = Character.MAX_VALUE; + if (k < da.length) { + to = da[k] - 1; + k++; + } + states[i].transitions.add(new Transition((char) from, (char) to, + states[0])); + from = to + 2; + } + } + } + a.deterministic = true; + return a; + } + + private static Automaton getWhitespaceAutomaton() { + return ws; + } +} Property changes on: src\java\org\apache\lucene\util\automaton\BasicAutomata.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/BasicOperations.java =================================================================== --- src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 0) @@ -0,0 +1,624 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Basic automata operations. + * + *

+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +final public class BasicOperations { + + private BasicOperations() {} + + /** + * Returns an automaton that accepts the concatenation of the languages of the + * given automata. + *

+ * Complexity: linear in number of states. + */ + static public Automaton concatenate(Automaton a1, Automaton a2) { + if (a1.isSingleton() && a2.isSingleton()) return BasicAutomata + .makeString(a1.singleton + a2.singleton); + if (a1 == a2) { + a1 = a1.cloneExpanded(); + a2 = a2.cloneExpanded(); + } else { + a1 = a1.cloneExpandedIfRequired(); + a2 = a2.cloneExpandedIfRequired(); + } + for (State s : a1.getAcceptStates()) { + s.accept = false; + s.addEpsilon(a2.initial); + } + a1.deterministic = false; + a1.clearHashCode(); + a1.checkMinimizeAlways(); + return a1; + } + + /** + * Returns an automaton that accepts the concatenation of the languages of the + * given automata. + *
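An illustrative sketch of the two-argument concatenate (not part of the patch):

    // Concatenating two singleton automata yields the singleton for the joined string.
    Automaton foobar = BasicOperations.concatenate(BasicAutomata.makeString("foo"),
        BasicAutomata.makeString("bar"));
    System.out.println(BasicOperations.run(foobar, "foobar")); // expected: true
    System.out.println(BasicOperations.run(foobar, "foo"));    // expected: false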

+ * Complexity: linear in total number of states. + */ + static public Automaton concatenate(List l) { + if (l.isEmpty()) return BasicAutomata.makeEmptyString(); + boolean all_singleton = true; + for (Automaton a : l) + if (!a.isSingleton()) { + all_singleton = false; + break; + } + if (all_singleton) { + StringBuilder b = new StringBuilder(); + for (Automaton a : l) + b.append(a.singleton); + return BasicAutomata.makeString(b.toString()); + } else { + for (Automaton a : l) + if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty(); + Set ids = new HashSet(); + for (Automaton a : l) + ids.add(System.identityHashCode(a)); + boolean has_aliases = ids.size() != l.size(); + Automaton b = l.get(0); + if (has_aliases) b = b.cloneExpanded(); + else b = b.cloneExpandedIfRequired(); + Set ac = b.getAcceptStates(); + boolean first = true; + for (Automaton a : l) + if (first) first = false; + else { + if (a.isEmptyString()) continue; + Automaton aa = a; + if (has_aliases) aa = aa.cloneExpanded(); + else aa = aa.cloneExpandedIfRequired(); + Set ns = aa.getAcceptStates(); + for (State s : ac) { + s.accept = false; + s.addEpsilon(aa.initial); + if (s.accept) ns.add(s); + } + ac = ns; + } + b.deterministic = false; + b.clearHashCode(); + b.checkMinimizeAlways(); + return b; + } + } + + /** + * Returns an automaton that accepts the union of the empty string and the + * language of the given automaton. + *

+ * Complexity: linear in number of states. + */ + static public Automaton optional(Automaton a) { + a = a.cloneExpandedIfRequired(); + State s = new State(); + s.addEpsilon(a.initial); + s.accept = true; + a.initial = s; + a.deterministic = false; + a.clearHashCode(); + a.checkMinimizeAlways(); + return a; + } + + /** + * Returns an automaton that accepts the Kleene star (zero or more + * concatenated repetitions) of the language of the given automaton. Never + * modifies the input automaton language. + *

+ * Complexity: linear in number of states. + */ + static public Automaton repeat(Automaton a) { + a = a.cloneExpanded(); + State s = new State(); + s.accept = true; + s.addEpsilon(a.initial); + for (State p : a.getAcceptStates()) + p.addEpsilon(s); + a.initial = s; + a.deterministic = false; + a.clearHashCode(); + a.checkMinimizeAlways(); + return a; + } + + /** + * Returns an automaton that accepts min or more concatenated + * repetitions of the language of the given automaton. + *
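A sketch of the single-argument repeat, i.e. Kleene star (illustrative only):

    // ("ab")* accepts the empty string and any number of "ab" repetitions.
    Automaton abStar = BasicOperations.repeat(BasicAutomata.makeString("ab"));
    System.out.println(BasicOperations.run(abStar, ""));       // expected: true
    System.out.println(BasicOperations.run(abStar, "ababab")); // expected: true
    System.out.println(BasicOperations.run(abStar, "aba"));    // expected: false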

+ * Complexity: linear in number of states and in min. + */ + static public Automaton repeat(Automaton a, int min) { + if (min == 0) return repeat(a); + List as = new ArrayList(); + while (min-- > 0) + as.add(a); + as.add(repeat(a)); + return concatenate(as); + } + + /** + * Returns an automaton that accepts between min and + * max (including both) concatenated repetitions of the language + * of the given automaton. + *

+ * Complexity: linear in number of states and in min and + * max. + */ + static public Automaton repeat(Automaton a, int min, int max) { + if (min > max) return BasicAutomata.makeEmpty(); + max -= min; + a.expandSingleton(); + Automaton b; + if (min == 0) b = BasicAutomata.makeEmptyString(); + else if (min == 1) b = a.clone(); + else { + List as = new ArrayList(); + while (min-- > 0) + as.add(a); + b = concatenate(as); + } + if (max > 0) { + Automaton d = a.clone(); + while (--max > 0) { + Automaton c = a.clone(); + for (State p : c.getAcceptStates()) + p.addEpsilon(d.initial); + d = c; + } + for (State p : b.getAcceptStates()) + p.addEpsilon(d.initial); + b.deterministic = false; + b.clearHashCode(); + b.checkMinimizeAlways(); + } + return b; + } + + /** + * Returns a (deterministic) automaton that accepts the complement of the + * language of the given automaton. + *

+ * Complexity: linear in number of states (if already deterministic). + */ + static public Automaton complement(Automaton a) { + a = a.cloneExpandedIfRequired(); + a.determinize(); + a.totalize(); + for (State p : a.getStates()) + p.accept = !p.accept; + a.removeDeadTransitions(); + return a; + } + + /** + * Returns a (deterministic) automaton that accepts the intersection of the + * language of a1 and the complement of the language of + * a2. As a side-effect, the automata may be determinized, if not + * already deterministic. + *

+ * Complexity: quadratic in number of states (if already deterministic). + */ + static public Automaton minus(Automaton a1, Automaton a2) { + if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata + .makeEmpty(); + if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired(); + if (a1.isSingleton()) { + if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty(); + else return a1.cloneIfRequired(); + } + return intersection(a1, a2.complement()); + } + + /** + * Returns an automaton that accepts the intersection of the languages of the + * given automata. Never modifies the input automata languages. + *
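The minus and intersection operations compose naturally; a sketch (illustrative only):

    // All two-character strings over {a, b}, with the single string "ab" removed.
    Automaton len2 = BasicOperations.repeat(BasicAutomata.makeCharSet("ab"), 2, 2);
    Automaton lang = BasicOperations.minus(len2, BasicAutomata.makeString("ab"));
    System.out.println(BasicOperations.run(lang, "aa")); // expected: true
    System.out.println(BasicOperations.run(lang, "ab")); // expected: false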

+ * Complexity: quadratic in number of states. + */ + static public Automaton intersection(Automaton a1, Automaton a2) { + if (a1.isSingleton()) { + if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired(); + else return BasicAutomata.makeEmpty(); + } + if (a2.isSingleton()) { + if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired(); + else return BasicAutomata.makeEmpty(); + } + if (a1 == a2) return a1.cloneIfRequired(); + Transition[][] transitions1 = Automaton + .getSortedTransitions(a1.getStates()); + Transition[][] transitions2 = Automaton + .getSortedTransitions(a2.getStates()); + Automaton c = new Automaton(); + LinkedList worklist = new LinkedList(); + HashMap newstates = new HashMap(); + StatePair p = new StatePair(c.initial, a1.initial, a2.initial); + worklist.add(p); + newstates.put(p, p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + p.s.accept = p.s1.accept && p.s2.accept; + Transition[] t1 = transitions1[p.s1.number]; + Transition[] t2 = transitions2[p.s2.number]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) + b2++; + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) + if (t2[n2].max >= t1[n1].min) { + StatePair q = new StatePair(t1[n1].to, t2[n2].to); + StatePair r = newstates.get(q); + if (r == null) { + q.s = new State(); + worklist.add(q); + newstates.put(q, q); + r = q; + } + char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; + char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; + p.s.transitions.add(new Transition(min, max, r.s)); + } + } + } + c.deterministic = a1.deterministic && a2.deterministic; + c.removeDeadTransitions(); + c.checkMinimizeAlways(); + return c; + } + + /** + * Returns true if the language of a1 is a subset of the language + * of a2. As a side-effect, a2 is determinized if + * not already marked as deterministic. + *

+ * Complexity: quadratic in number of states. + */ + public static boolean subsetOf(Automaton a1, Automaton a2) { + if (a1 == a2) return true; + if (a1.isSingleton()) { + if (a2.isSingleton()) return a1.singleton.equals(a2.singleton); + return BasicOperations.run(a2, a1.singleton); + } + a2.determinize(); + Transition[][] transitions1 = Automaton + .getSortedTransitions(a1.getStates()); + Transition[][] transitions2 = Automaton + .getSortedTransitions(a2.getStates()); + LinkedList worklist = new LinkedList(); + HashSet visited = new HashSet(); + StatePair p = new StatePair(a1.initial, a2.initial); + worklist.add(p); + visited.add(p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + if (p.s1.accept && !p.s2.accept) return false; + Transition[] t1 = transitions1[p.s1.number]; + Transition[] t2 = transitions2[p.s2.number]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) + b2++; + int min1 = t1[n1].min, max1 = t1[n1].max; + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { + if (t2[n2].min > min1) return false; + if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1; + else { + min1 = Character.MAX_VALUE; + max1 = Character.MIN_VALUE; + } + StatePair q = new StatePair(t1[n1].to, t2[n2].to); + if (!visited.contains(q)) { + worklist.add(q); + visited.add(q); + } + } + if (min1 <= max1) return false; + } + } + return true; + } + + /** + * Returns an automaton that accepts the union of the languages of the given + * automata. + *
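A sketch relating union and subsetOf (illustrative only):

    // The language {"foo"} is contained in the union {"foo", "bar"}, but not vice versa.
    Automaton u = BasicOperations.union(BasicAutomata.makeString("foo"),
        BasicAutomata.makeString("bar"));
    System.out.println(BasicOperations.subsetOf(BasicAutomata.makeString("foo"), u)); // expected: true
    System.out.println(BasicOperations.subsetOf(u, BasicAutomata.makeString("foo"))); // expected: false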

+ * Complexity: linear in number of states. + */ + public static Automaton union(Automaton a1, Automaton a2) { + if ((a1.isSingleton() && a2.isSingleton() && a1.singleton + .equals(a2.singleton)) + || a1 == a2) return a1.cloneIfRequired(); + if (a1 == a2) { + a1 = a1.cloneExpanded(); + a2 = a2.cloneExpanded(); + } else { + a1 = a1.cloneExpandedIfRequired(); + a2 = a2.cloneExpandedIfRequired(); + } + State s = new State(); + s.addEpsilon(a1.initial); + s.addEpsilon(a2.initial); + a1.initial = s; + a1.deterministic = false; + a1.clearHashCode(); + a1.checkMinimizeAlways(); + return a1; + } + + /** + * Returns an automaton that accepts the union of the languages of the given + * automata. + *

+ * Complexity: linear in number of states. + */ + public static Automaton union(Collection l) { + Set ids = new HashSet(); + for (Automaton a : l) + ids.add(System.identityHashCode(a)); + boolean has_aliases = ids.size() != l.size(); + State s = new State(); + for (Automaton b : l) { + if (BasicOperations.isEmpty(b)) continue; + Automaton bb = b; + if (has_aliases) bb = bb.cloneExpanded(); + else bb = bb.cloneExpandedIfRequired(); + s.addEpsilon(bb.initial); + } + Automaton a = new Automaton(); + a.initial = s; + a.deterministic = false; + a.clearHashCode(); + a.checkMinimizeAlways(); + return a; + } + + /** + * Determinizes the given automaton. + *

+ * Complexity: exponential in number of states. + */ + public static void determinize(Automaton a) { + if (a.deterministic || a.isSingleton()) return; + Set initialset = new HashSet(); + initialset.add(a.initial); + determinize(a, initialset); + } + + /** + * Determinizes the given automaton using the given set of initial states. + */ + static void determinize(Automaton a, Set initialset) { + char[] points = a.getStartPoints(); + // subset construction + Map,Set> sets = new HashMap,Set>(); + LinkedList> worklist = new LinkedList>(); + Map,State> newstate = new HashMap,State>(); + sets.put(initialset, initialset); + worklist.add(initialset); + a.initial = new State(); + newstate.put(initialset, a.initial); + while (worklist.size() > 0) { + Set s = worklist.removeFirst(); + State r = newstate.get(s); + for (State q : s) + if (q.accept) { + r.accept = true; + break; + } + for (int n = 0; n < points.length; n++) { + Set p = new HashSet(); + for (State q : s) + for (Transition t : q.transitions) + if (t.min <= points[n] && points[n] <= t.max) p.add(t.to); + if (!sets.containsKey(p)) { + sets.put(p, p); + worklist.add(p); + newstate.put(p, new State()); + } + State q = newstate.get(p); + char min = points[n]; + char max; + if (n + 1 < points.length) max = (char) (points[n + 1] - 1); + else max = Character.MAX_VALUE; + r.transitions.add(new Transition(min, max, q)); + } + } + a.deterministic = true; + a.removeDeadTransitions(); + } + + /** + * Adds epsilon transitions to the given automaton. This method adds extra + * character interval transitions that are equivalent to the given set of + * epsilon transitions. + * + * @param pairs collection of {@link StatePair} objects representing pairs of + * source/destination states where epsilon transitions should be + * added + */ + public static void addEpsilons(Automaton a, Collection pairs) { + a.expandSingleton(); + HashMap> forward = new HashMap>(); + HashMap> back = new HashMap>(); + for (StatePair p : pairs) { + HashSet to = forward.get(p.s1); + if (to == null) { + to = new HashSet(); + forward.put(p.s1, to); + } + to.add(p.s2); + HashSet from = back.get(p.s2); + if (from == null) { + from = new HashSet(); + back.put(p.s2, from); + } + from.add(p.s1); + } + // calculate epsilon closure + LinkedList worklist = new LinkedList(pairs); + HashSet workset = new HashSet(pairs); + while (!worklist.isEmpty()) { + StatePair p = worklist.removeFirst(); + workset.remove(p); + HashSet to = forward.get(p.s2); + HashSet from = back.get(p.s1); + if (to != null) { + for (State s : to) { + StatePair pp = new StatePair(p.s1, s); + if (!pairs.contains(pp)) { + pairs.add(pp); + forward.get(p.s1).add(s); + back.get(s).add(p.s1); + worklist.add(pp); + workset.add(pp); + if (from != null) { + for (State q : from) { + StatePair qq = new StatePair(q, p.s1); + if (!workset.contains(qq)) { + worklist.add(qq); + workset.add(qq); + } + } + } + } + } + } + } + // add transitions + for (StatePair p : pairs) + p.s1.addEpsilon(p.s2); + a.deterministic = false; + a.clearHashCode(); + a.checkMinimizeAlways(); + } + + /** + * Returns true if the given automaton accepts the empty string and nothing + * else. + */ + public static boolean isEmptyString(Automaton a) { + if (a.isSingleton()) return a.singleton.length() == 0; + else return a.initial.accept && a.initial.transitions.isEmpty(); + } + + /** + * Returns true if the given automaton accepts no strings. 
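A sketch of the emptiness checks (isEmptyString above, isEmpty just below); illustrative only:

    // Intersecting two disjoint singleton languages yields the empty language.
    Automaton none = BasicOperations.intersection(BasicAutomata.makeString("foo"),
        BasicAutomata.makeString("bar"));
    System.out.println(BasicOperations.isEmpty(none));       // expected: true
    System.out.println(BasicOperations.isEmptyString(none)); // expected: false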
+ */ + public static boolean isEmpty(Automaton a) { + if (a.isSingleton()) return false; + return !a.initial.accept && a.initial.transitions.isEmpty(); + } + + /** + * Returns true if the given automaton accepts all strings. + */ + public static boolean isTotal(Automaton a) { + if (a.isSingleton()) return false; + if (a.initial.accept && a.initial.transitions.size() == 1) { + Transition t = a.initial.transitions.iterator().next(); + return t.to == a.initial && t.min == Character.MIN_VALUE + && t.max == Character.MAX_VALUE; + } + return false; + } + + /** + * Returns true if the given string is accepted by the automaton. + *

+ * Complexity: linear in the length of the string. + *

+ * Note: for full performance, use the {@link RunAutomaton} class. + */ + public static boolean run(Automaton a, String s) { + if (a.isSingleton()) return s.equals(a.singleton); + if (a.deterministic) { + State p = a.initial; + for (int i = 0; i < s.length(); i++) { + State q = p.step(s.charAt(i)); + if (q == null) return false; + p = q; + } + return p.accept; + } else { + Set states = a.getStates(); + Automaton.setStateNumbers(states); + LinkedList pp = new LinkedList(); + LinkedList pp_other = new LinkedList(); + BitSet bb = new BitSet(states.size()); + BitSet bb_other = new BitSet(states.size()); + pp.add(a.initial); + ArrayList dest = new ArrayList(); + boolean accept = a.initial.accept; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + accept = false; + pp_other.clear(); + bb_other.clear(); + for (State p : pp) { + dest.clear(); + p.step(c, dest); + for (State q : dest) { + if (q.accept) accept = true; + if (!bb_other.get(q.number)) { + bb_other.set(q.number); + pp_other.add(q); + } + } + } + LinkedList tp = pp; + pp = pp_other; + pp_other = tp; + BitSet tb = bb; + bb = bb_other; + bb_other = tb; + } + return accept; + } + } +} Property changes on: src\java\org\apache\lucene\util\automaton\BasicOperations.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/MinimizationOperations.java =================================================================== --- src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 0) @@ -0,0 +1,278 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.Set; + +/** + * Operations for minimizing automata. + * + *
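A sketch of explicit minimization of a hand-built automaton (illustrative only; MinimizationOperations.minimize also determinizes as part of its work):

    Automaton a = BasicOperations.union(BasicAutomata.makeString("foo"),
        BasicAutomata.makeString("foobar"));
    MinimizationOperations.minimize(a); // modifies a in place
    System.out.println(BasicOperations.run(a, "foobar")); // expected: true
    System.out.println(BasicOperations.run(a, "fooba"));  // expected: false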

+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +final public class MinimizationOperations { + + private MinimizationOperations() {} + + /** + * Minimizes (and determinizes if not already deterministic) the given + * automaton. + * + * @see Automaton#setMinimization(int) + */ + public static void minimize(Automaton a) { + if (!a.isSingleton()) { + minimizeHopcroft(a); + } + // recompute hash code + a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2; + if (a.hash_code == 0) a.hash_code = 1; + } + + private static void initialize(ArrayList list, int size) { + for (int i = 0; i < size; i++) + list.add(null); + } + + /** + * Minimizes the given automaton using Hopcroft's algorithm. + */ + public static void minimizeHopcroft(Automaton a) { + a.determinize(); + Set tr = a.initial.getTransitions(); + if (tr.size() == 1) { + Transition t = tr.iterator().next(); + if (t.to == a.initial && t.min == Character.MIN_VALUE + && t.max == Character.MAX_VALUE) return; + } + a.totalize(); + // make arrays for numbered states and effective alphabet + Set ss = a.getStates(); + State[] states = new State[ss.size()]; + int number = 0; + for (State q : ss) { + states[number] = q; + q.number = number++; + } + char[] sigma = a.getStartPoints(); + // initialize data structures + ArrayList>> reverse = new ArrayList>>(); + for (int q = 0; q < states.length; q++) { + ArrayList> v = new ArrayList>(); + initialize(v, sigma.length); + reverse.add(v); + } + boolean[][] reverse_nonempty = new boolean[states.length][sigma.length]; + ArrayList> partition = new ArrayList>(); + initialize(partition, states.length); + int[] block = new int[states.length]; + StateList[][] active = new StateList[states.length][sigma.length]; + StateListNode[][] active2 = new StateListNode[states.length][sigma.length]; + LinkedList pending = new LinkedList(); + boolean[][] pending2 = new boolean[sigma.length][states.length]; + ArrayList split = new ArrayList(); + boolean[] split2 = new boolean[states.length]; + ArrayList refine = new ArrayList(); + boolean[] refine2 = new boolean[states.length]; + ArrayList> splitblock = new ArrayList>(); + initialize(splitblock, states.length); + for (int q = 0; q < states.length; q++) { + splitblock.set(q, new ArrayList()); + partition.set(q, new LinkedList()); + for (int x = 0; x < sigma.length; x++) { + reverse.get(q).set(x, new LinkedList()); + active[q][x] = new StateList(); + } + } + // find initial partition and reverse edges + for (int q = 0; q < states.length; q++) { + State qq = states[q]; + int j; + if (qq.accept) j = 0; + else j = 1; + partition.get(j).add(qq); + block[qq.number] = j; + for (int x = 0; x < sigma.length; x++) { + char y = sigma[x]; + State p = qq.step(y); + reverse.get(p.number).get(x).add(qq); + reverse_nonempty[p.number][x] = true; + } + } + // initialize active sets + for (int j = 0; j <= 1; j++) + for (int x = 0; x < sigma.length; x++) + for (State qq : partition.get(j)) + if (reverse_nonempty[qq.number][x]) active2[qq.number][x] = active[j][x] + .add(qq); + // initialize pending + for (int x = 0; x < sigma.length; x++) { + int a0 = active[0][x].size; + int a1 = active[1][x].size; + int j; + if (a0 <= a1) j = 0; + else j = 1; + pending.add(new IntPair(j, x)); + pending2[x][j] = true; + } + // process pending until fixed point + int k = 2; + while (!pending.isEmpty()) { + IntPair ip = pending.removeFirst(); + int p = ip.n1; + int x 
= ip.n2; + pending2[x][p] = false; + // find states that need to be split off their blocks + for (StateListNode m = active[p][x].first; m != null; m = m.next) + for (State s : reverse.get(m.q.number).get(x)) + if (!split2[s.number]) { + split2[s.number] = true; + split.add(s); + int j = block[s.number]; + splitblock.get(j).add(s); + if (!refine2[j]) { + refine2[j] = true; + refine.add(j); + } + } + // refine blocks + for (int j : refine) { + if (splitblock.get(j).size() < partition.get(j).size()) { + LinkedList b1 = partition.get(j); + LinkedList b2 = partition.get(k); + for (State s : splitblock.get(j)) { + b1.remove(s); + b2.add(s); + block[s.number] = k; + for (int c = 0; c < sigma.length; c++) { + StateListNode sn = active2[s.number][c]; + if (sn != null && sn.sl == active[j][c]) { + sn.remove(); + active2[s.number][c] = active[k][c].add(s); + } + } + } + // update pending + for (int c = 0; c < sigma.length; c++) { + int aj = active[j][c].size; + int ak = active[k][c].size; + if (!pending2[c][j] && 0 < aj && aj <= ak) { + pending2[c][j] = true; + pending.add(new IntPair(j, c)); + } else { + pending2[c][k] = true; + pending.add(new IntPair(k, c)); + } + } + k++; + } + for (State s : splitblock.get(j)) + split2[s.number] = false; + refine2[j] = false; + splitblock.get(j).clear(); + } + split.clear(); + refine.clear(); + } + // make a new state for each equivalence class, set initial state + State[] newstates = new State[k]; + for (int n = 0; n < newstates.length; n++) { + State s = new State(); + newstates[n] = s; + for (State q : partition.get(n)) { + if (q == a.initial) a.initial = s; + s.accept = q.accept; + s.number = q.number; // select representative + q.number = n; + } + } + // build transitions and set acceptance + for (int n = 0; n < newstates.length; n++) { + State s = newstates[n]; + s.accept = states[s.number].accept; + for (Transition t : states[s.number].transitions) + s.transitions.add(new Transition(t.min, t.max, newstates[t.to.number])); + } + a.removeDeadTransitions(); + } + + static class IntPair { + + int n1, n2; + + IntPair(int n1, int n2) { + this.n1 = n1; + this.n2 = n2; + } + } + + static class StateList { + + int size; + + StateListNode first, last; + + StateListNode add(State q) { + return new StateListNode(q, this); + } + } + + static class StateListNode { + + State q; + + StateListNode next, prev; + + StateList sl; + + StateListNode(State q, StateList sl) { + this.q = q; + this.sl = sl; + if (sl.size++ == 0) sl.first = sl.last = this; + else { + sl.last.next = this; + prev = sl.last; + sl.last = this; + } + } + + void remove() { + sl.size--; + if (sl.first == this) sl.first = next; + else prev.next = next; + if (sl.last == this) sl.last = prev; + else next.prev = prev; + } + } +} Property changes on: src\java\org\apache\lucene\util\automaton\MinimizationOperations.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/package.html =================================================================== --- src/java/org/apache/lucene/util/automaton/package.html (revision 0) +++ src/java/org/apache/lucene/util/automaton/package.html (revision 0) @@ -0,0 +1,50 @@ + + + + +Finite-state automaton for regular expressions. +

+This package contains a full DFA/NFA implementation with Unicode +alphabet and support for all standard (and a number of non-standard) +regular expression operations. +

+The most commonly used functionality is located in the classes +{@link org.apache.lucene.util.automaton.Automaton} and +{@link org.apache.lucene.util.automaton.RegExp}. +
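A typical end-to-end use of the two classes, as a sketch (assumes org.apache.lucene.util.automaton.* is imported; not part of the patch):

    // Parse a pattern, build a minimal deterministic automaton, test candidate strings.
    Automaton a = new RegExp("[a-c]+x?").toAutomaton();
    System.out.println(BasicOperations.run(a, "abcx")); // expected: true
    System.out.println(BasicOperations.run(a, "x"));    // expected: false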

+For more information, go to the package home page at +http://www.brics.dk/automaton/. +

+WARNING: The status of the Automaton feature is experimental. +The APIs introduced here might change in the future and will not be +supported anymore in such a case. + + Property changes on: src\java\org\apache\lucene\util\automaton\package.html ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/RegExp.java =================================================================== --- src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0) @@ -0,0 +1,1003 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Regular Expression extension to Automaton. + *

+ * Regular expressions are built from the following abstract syntax: + *

+ * <pre>
+ *  regexp        ::=  unionexp
+ *                 |
+ *  unionexp      ::=  interexp | unionexp            (union)
+ *                 |   interexp
+ *  interexp      ::=  concatexp &amp; interexp       (intersection)       [OPTIONAL]
+ *                 |   concatexp
+ *  concatexp     ::=  repeatexp concatexp            (concatenation)
+ *                 |   repeatexp
+ *  repeatexp     ::=  repeatexp ?                    (zero or one occurrence)
+ *                 |   repeatexp *                    (zero or more occurrences)
+ *                 |   repeatexp +                    (one or more occurrences)
+ *                 |   repeatexp {n}                  (n occurrences)
+ *                 |   repeatexp {n,}                 (n or more occurrences)
+ *                 |   repeatexp {n,m}                (n to m occurrences, including both)
+ *                 |   complexp
+ *  complexp      ::=  ~ complexp                     (complement)         [OPTIONAL]
+ *                 |   charclassexp
+ *  charclassexp  ::=  [ charclasses ]                (character class)
+ *                 |   [^ charclasses ]               (negated character class)
+ *                 |   simpleexp
+ *  charclasses   ::=  charclass charclasses
+ *                 |   charclass
+ *  charclass     ::=  charexp - charexp              (character range, including end-points)
+ *                 |   charexp
+ *  simpleexp     ::=  charexp
+ *                 |   .                              (any single character)
+ *                 |   #                              (the empty language)  [OPTIONAL]
+ *                 |   @                              (any string)          [OPTIONAL]
+ *                 |   " &lt;Unicode string without double-quotes&gt; "    (a string)
+ *                 |   ( )                            (the empty string)
+ *                 |   ( unionexp )                   (precedence override)
+ *                 |   &lt; &lt;identifier&gt; &gt;   (named automaton)     [OPTIONAL]
+ *                 |   &lt;n-m&gt;                    (numerical interval)  [OPTIONAL]
+ *  charexp       ::=  &lt;Unicode character&gt;      (a single non-reserved character)
+ *                 |   \ &lt;Unicode character&gt;    (a single character)
+ * </pre>

+ * The productions marked [OPTIONAL] are only allowed if + * specified by the syntax flags passed to the RegExp constructor. + * The reserved characters used in the (enabled) syntax must be escaped with + * backslash (\) or double-quotes ("..."). (In + * contrast to other regexp syntaxes, this is required also in character + * classes.) Be aware that dash (-) has a special meaning in + * charclass expressions. An identifier is a string not containing right + * angle bracket (>) or dash (-). Numerical + * intervals are specified by non-negative decimal integers and include both end + * points, and if n and m have the same number + * of digits, then the conforming strings must have that length (i.e. prefixed + * by 0's). + * + *
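For example, the numerical-interval construct only parses when the corresponding syntax flag is passed to the constructor; a sketch (illustrative only):

    // <n-m> requires RegExp.INTERVAL; with RegExp.NONE the '<' would be treated as a literal.
    Automaton dayOfMonth = new RegExp("<1-31>", RegExp.INTERVAL).toAutomaton();
    System.out.println(BasicOperations.run(dayOfMonth, "7"));  // expected: true
    System.out.println(BasicOperations.run(dayOfMonth, "32")); // expected: false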

+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public class RegExp { + + enum Kind { + REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL + } + + /** + * Syntax flag, enables intersection (&). + */ + public static final int INTERSECTION = 0x0001; + + /** + * Syntax flag, enables complement (~). + */ + public static final int COMPLEMENT = 0x0002; + + /** + * Syntax flag, enables empty language (#). + */ + public static final int EMPTY = 0x0004; + + /** + * Syntax flag, enables anystring (@). + */ + public static final int ANYSTRING = 0x0008; + + /** + * Syntax flag, enables named automata (<identifier>). + */ + public static final int AUTOMATON = 0x0010; + + /** + * Syntax flag, enables numerical intervals ( + * <n-m>). + */ + public static final int INTERVAL = 0x0020; + + /** + * Syntax flag, enables all optional regexp syntax. + */ + public static final int ALL = 0xffff; + + /** + * Syntax flag, enables no optional regexp syntax. + */ + public static final int NONE = 0x0000; + + private static boolean allow_mutation = false; + + Kind kind; + RegExp exp1, exp2; + String s; + char c; + int min, max, digits; + char from, to; + + String b; + int flags; + int pos; + + RegExp() {} + + /** + * Constructs new RegExp from a string. Same as + * RegExp(s, ALL). + * + * @param s regexp string + * @exception IllegalArgumentException if an error occured while parsing the + * regular expression + */ + public RegExp(String s) throws IllegalArgumentException { + this(s, ALL); + } + + /** + * Constructs new RegExp from a string. + * + * @param s regexp string + * @param syntax_flags boolean 'or' of optional syntax constructs to be + * enabled + * @exception IllegalArgumentException if an error occured while parsing the + * regular expression + */ + public RegExp(String s, int syntax_flags) throws IllegalArgumentException { + b = s; + flags = syntax_flags; + RegExp e; + if (s.length() == 0) e = makeString(""); + else { + e = parseUnionExp(); + if (pos < b.length()) throw new IllegalArgumentException( + "end-of-string expected at position " + pos); + } + kind = e.kind; + exp1 = e.exp1; + exp2 = e.exp2; + this.s = e.s; + c = e.c; + min = e.min; + max = e.max; + digits = e.digits; + from = e.from; + to = e.to; + b = null; + } + + /** + * Constructs new Automaton from this RegExp. Same + * as toAutomaton(null) (empty automaton map). + */ + public Automaton toAutomaton() { + return toAutomatonAllowMutate(null, null); + } + + /** + * Constructs new Automaton from this RegExp. The + * constructed automaton is minimal and deterministic and has no transitions + * to dead states. + * + * @param automaton_provider provider of automata for named identifiers + * @exception IllegalArgumentException if this regular expression uses a named + * identifier that is not available from the automaton provider + */ + public Automaton toAutomaton(AutomatonProvider automaton_provider) + throws IllegalArgumentException { + return toAutomatonAllowMutate(null, automaton_provider); + } + + /** + * Constructs new Automaton from this RegExp. The + * constructed automaton is minimal and deterministic and has no transitions + * to dead states. 
+ * + * @param automata a map from automaton identifiers to automata (of type + * Automaton). + * @exception IllegalArgumentException if this regular expression uses a named + * identifier that does not occur in the automaton map + */ + public Automaton toAutomaton(Map automata) + throws IllegalArgumentException { + return toAutomatonAllowMutate(automata, null); + } + + /** + * Sets or resets allow mutate flag. If this flag is set, then automata + * construction uses mutable automata, which is slightly faster but not thread + * safe. By default, the flag is not set. + * + * @param flag if true, the flag is set + * @return previous value of the flag + */ + public boolean setAllowMutate(boolean flag) { + boolean b = allow_mutation; + allow_mutation = flag; + return b; + } + + private Automaton toAutomatonAllowMutate(Map automata, + AutomatonProvider automaton_provider) throws IllegalArgumentException { + boolean b = false; + if (allow_mutation) b = Automaton.setAllowMutate(true); // thread unsafe + Automaton a = toAutomaton(automata, automaton_provider); + if (allow_mutation) Automaton.setAllowMutate(b); + return a; + } + + private Automaton toAutomaton(Map automata, + AutomatonProvider automaton_provider) throws IllegalArgumentException { + List list; + Automaton a = null; + switch (kind) { + case REGEXP_UNION: + list = new ArrayList(); + findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider); + findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider); + a = BasicOperations.union(list); + MinimizationOperations.minimize(a); + break; + case REGEXP_CONCATENATION: + list = new ArrayList(); + findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata, + automaton_provider); + findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata, + automaton_provider); + a = BasicOperations.concatenate(list); + MinimizationOperations.minimize(a); + break; + case REGEXP_INTERSECTION: + a = exp1.toAutomaton(automata, automaton_provider).intersection( + exp2.toAutomaton(automata, automaton_provider)); + MinimizationOperations.minimize(a); + break; + case REGEXP_OPTIONAL: + a = exp1.toAutomaton(automata, automaton_provider).optional(); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT: + a = exp1.toAutomaton(automata, automaton_provider).repeat(); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT_MIN: + a = exp1.toAutomaton(automata, automaton_provider).repeat(min); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT_MINMAX: + a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max); + MinimizationOperations.minimize(a); + break; + case REGEXP_COMPLEMENT: + a = exp1.toAutomaton(automata, automaton_provider).complement(); + MinimizationOperations.minimize(a); + break; + case REGEXP_CHAR: + a = BasicAutomata.makeChar(c); + break; + case REGEXP_CHAR_RANGE: + a = BasicAutomata.makeCharRange(from, to); + break; + case REGEXP_ANYCHAR: + a = BasicAutomata.makeAnyChar(); + break; + case REGEXP_EMPTY: + a = BasicAutomata.makeEmpty(); + break; + case REGEXP_STRING: + a = BasicAutomata.makeString(s); + break; + case REGEXP_ANYSTRING: + a = BasicAutomata.makeAnyString(); + break; + case REGEXP_AUTOMATON: + Automaton aa = null; + if (automata != null) aa = automata.get(s); + if (aa == null && automaton_provider != null) try { + aa = automaton_provider.getAutomaton(s); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + if (aa == null) throw new IllegalArgumentException("'" + s + + "' not found"); + a = 
aa.clone(); // always clone here (ignore allow_mutate) + break; + case REGEXP_INTERVAL: + a = BasicAutomata.makeInterval(min, max, digits); + break; + } + return a; + } + + private void findLeaves(RegExp exp, Kind kind, List list, + Map automata, AutomatonProvider automaton_provider) { + if (exp.kind == kind) { + findLeaves(exp.exp1, kind, list, automata, automaton_provider); + findLeaves(exp.exp2, kind, list, automata, automaton_provider); + } else list.add(exp.toAutomaton(automata, automaton_provider)); + } + + /** + * Constructs string from parsed regular expression. + */ + @Override + public String toString() { + return toStringBuilder(new StringBuilder()).toString(); + } + + StringBuilder toStringBuilder(StringBuilder b) { + switch (kind) { + case REGEXP_UNION: + b.append("("); + exp1.toStringBuilder(b); + b.append("|"); + exp2.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_CONCATENATION: + exp1.toStringBuilder(b); + exp2.toStringBuilder(b); + break; + case REGEXP_INTERSECTION: + b.append("("); + exp1.toStringBuilder(b); + b.append("&"); + exp2.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_OPTIONAL: + b.append("("); + exp1.toStringBuilder(b); + b.append(")?"); + break; + case REGEXP_REPEAT: + b.append("("); + exp1.toStringBuilder(b); + b.append(")*"); + break; + case REGEXP_REPEAT_MIN: + b.append("("); + exp1.toStringBuilder(b); + b.append("){").append(min).append(",}"); + break; + case REGEXP_REPEAT_MINMAX: + b.append("("); + exp1.toStringBuilder(b); + b.append("){").append(min).append(",").append(max).append("}"); + break; + case REGEXP_COMPLEMENT: + b.append("~("); + exp1.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_CHAR: + b.append("\\").append(c); + break; + case REGEXP_CHAR_RANGE: + b.append("[\\").append(from).append("-\\").append(to).append("]"); + break; + case REGEXP_ANYCHAR: + b.append("."); + break; + case REGEXP_EMPTY: + b.append("#"); + break; + case REGEXP_STRING: + b.append("\"").append(s).append("\""); + break; + case REGEXP_ANYSTRING: + b.append("@"); + break; + case REGEXP_AUTOMATON: + b.append("<").append(s).append(">"); + break; + case REGEXP_INTERVAL: + String s1 = Integer.toString(min); + String s2 = Integer.toString(max); + b.append("<"); + if (digits > 0) for (int i = s1.length(); i < digits; i++) + b.append('0'); + b.append(s1).append("-"); + if (digits > 0) for (int i = s2.length(); i < digits; i++) + b.append('0'); + b.append(s2).append(">"); + break; + } + return b; + } + + /** + * Returns set of automaton identifiers that occur in this regular expression. 
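A sketch of resolving a named automaton through the Map-based toAutomaton overload together with getIdentifiers; illustrative only, and it assumes the map is keyed by identifier name (Map<String, Automaton>) as in the upstream brics API, with java.util.Map and java.util.HashMap imported.

    Map<String, Automaton> named = new HashMap<String, Automaton>();
    named.put("digits", BasicAutomata.makeCharRange('0', '9'));
    RegExp re = new RegExp("<digits>+", RegExp.AUTOMATON);
    System.out.println(re.getIdentifiers());            // expected: [digits]
    Automaton a = re.toAutomaton(named);
    System.out.println(BasicOperations.run(a, "2009")); // expected: true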
+ */ + public Set getIdentifiers() { + HashSet set = new HashSet(); + getIdentifiers(set); + return set; + } + + void getIdentifiers(Set set) { + switch (kind) { + case REGEXP_UNION: + case REGEXP_CONCATENATION: + case REGEXP_INTERSECTION: + exp1.getIdentifiers(set); + exp2.getIdentifiers(set); + break; + case REGEXP_OPTIONAL: + case REGEXP_REPEAT: + case REGEXP_REPEAT_MIN: + case REGEXP_REPEAT_MINMAX: + case REGEXP_COMPLEMENT: + exp1.getIdentifiers(set); + break; + case REGEXP_AUTOMATON: + set.add(s); + break; + default: + } + } + + static RegExp makeUnion(RegExp exp1, RegExp exp2) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_UNION; + r.exp1 = exp1; + r.exp2 = exp2; + return r; + } + + static RegExp makeConcatenation(RegExp exp1, RegExp exp2) { + if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) + && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString( + exp1, exp2); + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_CONCATENATION; + if (exp1.kind == Kind.REGEXP_CONCATENATION + && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING) + && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) { + r.exp1 = exp1.exp1; + r.exp2 = makeString(exp1.exp2, exp2); + } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) + && exp2.kind == Kind.REGEXP_CONCATENATION + && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) { + r.exp1 = makeString(exp1, exp2.exp1); + r.exp2 = exp2.exp2; + } else { + r.exp1 = exp1; + r.exp2 = exp2; + } + return r; + } + + static private RegExp makeString(RegExp exp1, RegExp exp2) { + StringBuilder b = new StringBuilder(); + if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s); + else b.append(exp1.c); + if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s); + else b.append(exp2.c); + return makeString(b.toString()); + } + + static RegExp makeIntersection(RegExp exp1, RegExp exp2) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_INTERSECTION; + r.exp1 = exp1; + r.exp2 = exp2; + return r; + } + + static RegExp makeOptional(RegExp exp) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_OPTIONAL; + r.exp1 = exp; + return r; + } + + static RegExp makeRepeat(RegExp exp) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_REPEAT; + r.exp1 = exp; + return r; + } + + static RegExp makeRepeat(RegExp exp, int min) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_REPEAT_MIN; + r.exp1 = exp; + r.min = min; + return r; + } + + static RegExp makeRepeat(RegExp exp, int min, int max) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_REPEAT_MINMAX; + r.exp1 = exp; + r.min = min; + r.max = max; + return r; + } + + static RegExp makeComplement(RegExp exp) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_COMPLEMENT; + r.exp1 = exp; + return r; + } + + static RegExp makeChar(char c) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_CHAR; + r.c = c; + return r; + } + + static RegExp makeCharRange(char from, char to) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_CHAR_RANGE; + r.from = from; + r.to = to; + return r; + } + + static RegExp makeAnyChar() { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_ANYCHAR; + return r; + } + + static RegExp makeEmpty() { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_EMPTY; + return r; + } + + static RegExp makeString(String s) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_STRING; + r.s = s; + return r; + } + + static RegExp makeAnyString() { + RegExp r = new RegExp(); + r.kind = 
Kind.REGEXP_ANYSTRING; + return r; + } + + static RegExp makeAutomaton(String s) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_AUTOMATON; + r.s = s; + return r; + } + + static RegExp makeInterval(int min, int max, int digits) { + RegExp r = new RegExp(); + r.kind = Kind.REGEXP_INTERVAL; + r.min = min; + r.max = max; + r.digits = digits; + return r; + } + + private boolean peek(String s) { + return more() && s.indexOf(b.charAt(pos)) != -1; + } + + private boolean match(char c) { + if (pos >= b.length()) return false; + if (b.charAt(pos) == c) { + pos++; + return true; + } + return false; + } + + private boolean more() { + return pos < b.length(); + } + + private char next() throws IllegalArgumentException { + if (!more()) throw new IllegalArgumentException("unexpected end-of-string"); + return b.charAt(pos++); + } + + private boolean check(int flag) { + return (flags & flag) != 0; + } + + final RegExp parseUnionExp() throws IllegalArgumentException { + RegExp e = parseInterExp(); + if (match('|')) e = makeUnion(e, parseUnionExp()); + return e; + } + + final RegExp parseInterExp() throws IllegalArgumentException { + RegExp e = parseConcatExp(); + if (check(INTERSECTION) && match('&')) e = makeIntersection(e, + parseInterExp()); + return e; + } + + final RegExp parseConcatExp() throws IllegalArgumentException { + RegExp e = parseRepeatExp(); + if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation( + e, parseConcatExp()); + return e; + } + + final RegExp parseRepeatExp() throws IllegalArgumentException { + RegExp e = parseComplExp(); + while (peek("?*+{")) { + if (match('?')) e = makeOptional(e); + else if (match('*')) e = makeRepeat(e); + else if (match('+')) e = makeRepeat(e, 1); + else if (match('{')) { + int start = pos; + while (peek("0123456789")) + next(); + if (start == pos) throw new IllegalArgumentException( + "integer expected at position " + pos); + int n = Integer.parseInt(b.substring(start, pos)); + int m = -1; + if (match(',')) { + start = pos; + while (peek("0123456789")) + next(); + if (start != pos) m = Integer.parseInt(b.substring(start, pos)); + } else m = n; + if (!match('}')) throw new IllegalArgumentException( + "expected '}' at position " + pos); + if (m == -1) e = makeRepeat(e, n); + else e = makeRepeat(e, n, m); + } + } + return e; + } + + final RegExp parseComplExp() throws IllegalArgumentException { + if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp()); + else return parseCharClassExp(); + } + + final RegExp parseCharClassExp() throws IllegalArgumentException { + if (match('[')) { + boolean negate = false; + if (match('^')) negate = true; + RegExp e = parseCharClasses(); + if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e)); + if (!match(']')) throw new IllegalArgumentException( + "expected ']' at position " + pos); + return e; + } else return parseSimpleExp(); + } + + final RegExp parseCharClasses() throws IllegalArgumentException { + RegExp e = parseCharClass(); + while (more() && !peek("]")) + e = makeUnion(e, parseCharClass()); + return e; + } + + final RegExp parseCharClass() throws IllegalArgumentException { + char c = parseCharExp(); + if (match('-')) return makeCharRange(c, parseCharExp()); + else return makeChar(c); + } + + final RegExp parseSimpleExp() throws IllegalArgumentException { + if (match('.')) return makeAnyChar(); + else if (check(EMPTY) && match('#')) return makeEmpty(); + else if (check(ANYSTRING) && match('@')) return makeAnyString(); + else if (match('"')) { + 
int start = pos; + while (more() && !peek("\"")) + next(); + if (!match('"')) throw new IllegalArgumentException( + "expected '\"' at position " + pos); + return makeString(b.substring(start, pos - 1)); + } else if (match('(')) { + if (match(')')) return makeString(""); + RegExp e = parseUnionExp(); + if (!match(')')) throw new IllegalArgumentException( + "expected ')' at position " + pos); + return e; + } else if ((check(AUTOMATON) || check(INTERVAL)) && match('<')) { + int start = pos; + while (more() && !peek(">")) + next(); + if (!match('>')) throw new IllegalArgumentException( + "expected '>' at position " + pos); + String s = b.substring(start, pos - 1); + int i = s.indexOf('-'); + if (i == -1) { + if (!check(AUTOMATON)) throw new IllegalArgumentException( + "interval syntax error at position " + (pos - 1)); + return makeAutomaton(s); + } else { + if (!check(INTERVAL)) throw new IllegalArgumentException( + "illegal identifier at position " + (pos - 1)); + try { + if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-')) throw new NumberFormatException(); + String smin = s.substring(0, i); + String smax = s.substring(i + 1, s.length()); + int imin = Integer.parseInt(smin); + int imax = Integer.parseInt(smax); + int digits; + if (smin.length() == smax.length()) digits = smin.length(); + else digits = 0; + if (imin > imax) { + int t = imin; + imin = imax; + imax = t; + } + return makeInterval(imin, imax, digits); + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "interval syntax error at position " + (pos - 1)); + } + } + } else return makeChar(parseCharExp()); + } + + final char parseCharExp() throws IllegalArgumentException { + match('\\'); + return next(); + } +} Property changes on: src\java\org\apache\lucene\util\automaton\RegExp.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/RunAutomaton.java =================================================================== --- src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 0) @@ -0,0 +1,238 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Set; + +/** + * Finite-state automaton with fast run operation. + * + *
<p>
+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public class RunAutomaton implements Serializable { + + static final long serialVersionUID = 20001; + + int size; + boolean[] accept; + int initial; + int[] transitions; // delta(state,c) = transitions[state*points.length + + // getCharClass(c)] + char[] points; // char interval start points + int[] classmap; // map from char number to class class + + /** + * Sets alphabet table for optimal run performance. + */ + final void setAlphabet() { + classmap = new int[Character.MAX_VALUE - Character.MIN_VALUE + 1]; + int i = 0; + for (int j = 0; j <= Character.MAX_VALUE - Character.MIN_VALUE; j++) { + if (i + 1 < points.length && j == points[i + 1]) i++; + classmap[j] = i; + } + } + + /** + * Returns a string representation of this automaton. + */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append("initial state: ").append(initial).append("\n"); + for (int i = 0; i < size; i++) { + b.append("state " + i); + if (accept[i]) b.append(" [accept]:\n"); + else b.append(" [reject]:\n"); + for (int j = 0; j < points.length; j++) { + int k = transitions[i * points.length + j]; + if (k != -1) { + char min = points[j]; + char max; + if (j + 1 < points.length) max = (char) (points[j + 1] - 1); + else max = Character.MAX_VALUE; + b.append(" "); + Transition.appendCharString(min, b); + if (min != max) { + b.append("-"); + Transition.appendCharString(max, b); + } + b.append(" -> ").append(k).append("\n"); + } + } + } + return b.toString(); + } + + /** + * Returns number of states in automaton. + */ + public int getSize() { + return size; + } + + /** + * Returns acceptance status for given state. + */ + public boolean isAccept(int state) { + return accept[state]; + } + + /** + * Returns initial state. + */ + public int getInitialState() { + return initial; + } + + /** + * Returns array of character class interval start points. The array should + * not be modified by the caller. + */ + public char[] getCharIntervals() { + return points.clone(); + } + + /** + * Gets character class of given char. + */ + int getCharClass(char c) { + return SpecialOperations.findIndex(c, points); + } + + @SuppressWarnings("unused") + private RunAutomaton() {} + + /** + * Constructs a new RunAutomaton from a deterministic + * Automaton. Same as RunAutomaton(a, true). + * + * @param a an automaton + */ + public RunAutomaton(Automaton a) { + this(a, true); + } + + /** + * Constructs a new RunAutomaton from a deterministic + * Automaton. If the given automaton is not deterministic, it is + * determinized first. 
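A minimal end-to-end sketch of how this class is typically driven (the class name RunAutomatonSketch and the sample pattern are illustrative, not part of the patch): a RegExp is parsed into an Automaton, compiled into a RunAutomaton with the transition table enabled, and then run against candidate strings.

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RunAutomaton;

public class RunAutomatonSketch {
  public static void main(String[] args) {
    // Parse a regular expression and compile it for fast matching.
    Automaton a = new RegExp("[a-z]+[0-9]{2}").toAutomaton();
    // tableize == true builds the classmap, so step() becomes a plain array lookup:
    // transitions[state * points.length + classmap[c - Character.MIN_VALUE]]
    RunAutomaton r = new RunAutomaton(a, true);
    System.out.println(r.run("lucene29"));   // expected: true
    System.out.println(r.run("lucene2009")); // expected: false ({2} requires exactly two digits)
  }
}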
+ * + * @param a an automaton + * @param tableize if true, a transition table is created which makes the + * run method faster in return of a higher memory usage + */ + public RunAutomaton(Automaton a, boolean tableize) { + a.determinize(); + points = a.getStartPoints(); + Set states = a.getStates(); + Automaton.setStateNumbers(states); + initial = a.initial.number; + size = states.size(); + accept = new boolean[size]; + transitions = new int[size * points.length]; + for (int n = 0; n < size * points.length; n++) + transitions[n] = -1; + for (State s : states) { + int n = s.number; + accept[n] = s.accept; + for (int c = 0; c < points.length; c++) { + State q = s.step(points[c]); + if (q != null) transitions[n * points.length + c] = q.number; + } + } + if (tableize) setAlphabet(); + } + + /** + * Returns the state obtained by reading the given char from the given state. + * Returns -1 if not obtaining any such state. (If the original + * Automaton had no dead states, -1 is returned here if and only + * if a dead state is entered in an equivalent automaton with a total + * transition function.) + */ + public int step(int state, char c) { + if (classmap == null) return transitions[state * points.length + + getCharClass(c)]; + else return transitions[state * points.length + + classmap[c - Character.MIN_VALUE]]; + } + + /** + * Returns true if the given string is accepted by this automaton. + */ + public boolean run(String s) { + int p = initial; + int l = s.length(); + for (int i = 0; i < l; i++) { + p = step(p, s.charAt(i)); + if (p == -1) return false; + } + return accept[p]; + } + + /** + * Returns true if the given string is accepted by this automaton + */ + public boolean run(char[] s, int offset, int length) { + int p = initial; + int l = offset + length; + for (int i = offset; i < l; i++) { + p = step(p, s[i]); + if (p == -1) return false; + } + return accept[p]; + } + + /** + * Returns the length of the longest accepted run of the given string starting + * at the given offset. + * + * @param s the string + * @param offset offset into s where the run starts + * @return length of the longest accepted run, -1 if no run is accepted + */ + public int run(String s, int offset) { + int p = initial; + int l = s.length(); + int max = -1; + for (int r = 0; offset <= l; offset++, r++) { + if (accept[p]) max = r; + if (offset == l) break; + p = step(p, s.charAt(offset)); + if (p == -1) break; + } + return max; + } +} Property changes on: src\java\org\apache\lucene\util\automaton\RunAutomaton.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/SpecialOperations.java =================================================================== --- src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0) @@ -0,0 +1,182 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +/** + * Special automata operations. + * + *
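A small usage sketch of the operations in this class (SpecialOperationsSketch is a hypothetical name; it only uses classes added elsewhere in this patch): concatenating "foo" with the any-string automaton gives an infinite language whose common prefix is still "foo".

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.SpecialOperations;

public class SpecialOperationsSketch {
  public static void main(String[] args) {
    // "foo" followed by any string: an infinite language.
    Automaton fooAnything = BasicOperations.concatenate(
        BasicAutomata.makeString("foo"), BasicAutomata.makeAnyString());
    System.out.println(SpecialOperations.getCommonPrefix(fooAnything)); // expected: foo
    System.out.println(SpecialOperations.isFinite(fooAnything));        // expected: false (the any-string loop)
    System.out.println(SpecialOperations.isFinite(BasicAutomata.makeString("foo"))); // expected: true
  }
}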
<p>
+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +final public class SpecialOperations { + + private SpecialOperations() {} + + /** + * Finds the largest entry whose value is less than or equal to c, or 0 if + * there is no such entry. + */ + static int findIndex(char c, char[] points) { + int a = 0; + int b = points.length; + while (b - a > 1) { + int d = (a + b) >>> 1; + if (points[d] > c) b = d; + else if (points[d] < c) a = d; + else return d; + } + return a; + } + + /** + * Returns true if the language of this automaton is finite. + */ + public static boolean isFinite(Automaton a) { + if (a.isSingleton()) return true; + return isFinite(a.initial, new HashSet()); + } + + /** + * Checks whether there is a loop containing s. (This is sufficient since + * there are never transitions to dead states.) + */ + private static boolean isFinite(State s, HashSet path) { + path.add(s); + for (Transition t : s.transitions) + if (path.contains(t.to) || !isFinite(t.to, path)) return false; + path.remove(s); + return true; + } + + /** + * Returns the longest string that is a prefix of all accepted strings and + * visits each state at most once. + * + * @return common prefix + */ + public static String getCommonPrefix(Automaton a) { + if (a.isSingleton()) return a.singleton; + StringBuilder b = new StringBuilder(); + HashSet visited = new HashSet(); + State s = a.initial; + boolean done; + do { + done = true; + visited.add(s); + if (!s.accept && s.transitions.size() == 1) { + Transition t = s.transitions.iterator().next(); + if (t.min == t.max && !visited.contains(t.to)) { + b.append(t.min); + s = t.to; + done = false; + } + } + } while (!done); + return b.toString(); + } + + /** + * Returns the longest string that is a suffix of all accepted strings and + * visits each state at most once. + * + * @return common suffix + */ + public static String getCommonSuffix(Automaton a) { + if (a.isSingleton()) // if singleton, the suffix is the string itself. + return a.singleton; + + // reverse the language of the automaton, then reverse its common prefix. + Automaton r = a.clone(); + reverse(r); + r.determinize(); + return reverseUnicode3(SpecialOperations.getCommonPrefix(r)); + } + + /** + * Reverses the language of the given (non-singleton) automaton while returning + * the set of new initial states. + */ + private static Set reverse(Automaton a) { + a.expandSingleton(); + // reverse all edges + HashMap> m = new HashMap>(); + Set states = a.getStates(); + Set accept = a.getAcceptStates(); + for (State r : states) { + m.put(r, new HashSet()); + r.accept = false; + } + for (State r : states) + for (Transition t : r.getTransitions()) + m.get(t.to).add(new Transition(t.min, t.max, r)); + for (State r : states) + r.transitions = m.get(r); + // make new initial+final states + a.initial.accept = true; + a.initial = new State(); + for (State r : accept) + a.initial.addEpsilon(r); // ensures that all initial states are reachable + a.deterministic = false; + return accept; + } + + /** + * Intentionally use a unicode 3 reverse. + * This is because we are only going to reverse it again... + */ + private static String reverseUnicode3( final String input ){ + char[] charInput = input.toCharArray(); + reverseUnicode3(charInput, 0, charInput.length); + return new String(charInput); + } + + /** + * Intentionally use a unicode 3 reverse. 
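The reason a plain code-unit ("unicode 3") reversal is safe in this context is that it is effectively applied twice: the first pass puts surrogate pairs in the wrong order, and the second pass puts them back. A self-contained round-trip sketch (ReverseRoundTripSketch and reverseUnits are illustrative names, not from the patch):

public class ReverseRoundTripSketch {
  // Plain UTF-16 code-unit reversal, the same strategy as reverseUnicode3 above.
  static String reverseUnits(String s) {
    char[] b = s.toCharArray();
    for (int i = 0, j = b.length - 1; i < j; i++, j--) {
      char t = b[i]; b[i] = b[j]; b[j] = t;
    }
    return new String(b);
  }

  public static void main(String[] args) {
    String s = "ab\uD866\uDF05";         // 'a', 'b', then one supplementary code point
    String once = reverseUnits(s);       // "\uDF05\uD866ba": the surrogate pair is now backwards
    String twice = reverseUnits(once);   // the second reversal restores the original, pairs intact
    System.out.println(s.equals(twice)); // expected: true
  }
}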
+ * This is because it is only used by getCommonSuffix(), + * which will reverse the entire FSM using code unit reversal, + * so we must then reverse its common prefix back using the + * same code point reversal. + */ + private static void reverseUnicode3(char[] buffer, int start, int len){ + if (len <= 1) return; + int num = len>>1; + for (int i = start; i < ( start + num ); i++) { + char c = buffer[i]; + buffer[i] = buffer[start * 2 + len - i - 1]; + buffer[start * 2 + len - i - 1] = c; + } + } +} Property changes on: src\java\org\apache\lucene\util\automaton\SpecialOperations.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/State.java =================================================================== --- src/java/org/apache/lucene/util/automaton/State.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/State.java (revision 0) @@ -0,0 +1,202 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Automaton state. + * + *
<p>
+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public class State implements Serializable, Comparable { + + static final long serialVersionUID = 30001; + + boolean accept; + Set transitions; + + int number; + + int id; + static int next_id; + + /** + * Constructs a new state. Initially, the new state is a reject state. + */ + public State() { + resetTransitions(); + id = next_id++; + } + + /** + * Resets transition set. + */ + final void resetTransitions() { + transitions = new HashSet(); + } + + /** + * Returns the set of outgoing transitions. Subsequent changes are reflected + * in the automaton. + * + * @return transition set + */ + public Set getTransitions() { + return transitions; + } + + /** + * Adds an outgoing transition. + * + * @param t transition + */ + public void addTransition(Transition t) { + transitions.add(t); + } + + /** + * Sets acceptance for this state. + * + * @param accept if true, this state is an accept state + */ + public void setAccept(boolean accept) { + this.accept = accept; + } + + /** + * Returns acceptance status. + * + * @return true is this is an accept state + */ + public boolean isAccept() { + return accept; + } + + /** + * Performs lookup in transitions, assuming determinism. + * + * @param c character to look up + * @return destination state, null if no matching outgoing transition + * @see #step(char, Collection) + */ + public State step(char c) { + for (Transition t : transitions) + if (t.min <= c && c <= t.max) return t.to; + return null; + } + + /** + * Performs lookup in transitions, allowing nondeterminism. + * + * @param c character to look up + * @param dest collection where destination states are stored + * @see #step(char) + */ + public void step(char c, Collection dest) { + for (Transition t : transitions) + if (t.min <= c && c <= t.max) dest.add(t.to); + } + + void addEpsilon(State to) { + if (to.accept) accept = true; + for (Transition t : to.transitions) + transitions.add(t); + } + + /** + * Returns transitions sorted by (min, reverse max, to) or (to, min, reverse + * max) + */ + Transition[] getSortedTransitionArray(boolean to_first) { + Transition[] e = transitions.toArray(new Transition[transitions.size()]); + Arrays.sort(e, new TransitionComparator(to_first)); + return e; + } + + /** + * Returns sorted list of outgoing transitions. + * + * @param to_first if true, order by (to, min, reverse max); otherwise (min, + * reverse max, to) + * @return transition list + */ + public List getSortedTransitions(boolean to_first) { + return Arrays.asList(getSortedTransitionArray(to_first)); + } + + /** + * Returns string describing this state. Normally invoked via + * {@link Automaton#toString()}. + */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append("state ").append(number); + if (accept) b.append(" [accept]"); + else b.append(" [reject]"); + b.append(":\n"); + for (Transition t : transitions) + b.append(" ").append(t.toString()).append("\n"); + return b.toString(); + } + + /** + * Compares this object with the specified object for order. States are + * ordered by the time of construction. + */ + public int compareTo(State s) { + return s.id - id; + } + + /** + * See {@link java.lang.Object#equals(java.lang.Object)}. + */ + @Override + public boolean equals(Object obj) { + return super.equals(obj); + } + + /** + * See {@link java.lang.Object#hashCode()}. 
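A quick sketch of the State/Transition API documented above, wiring up a two-state automaton by hand (StateSketch is a hypothetical name):

import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;

public class StateSketch {
  public static void main(String[] args) {
    State start = new State();   // a new state rejects by default
    State end = new State();
    end.setAccept(true);
    // one outgoing transition covering the character interval a..z
    start.addTransition(new Transition('a', 'z', end));
    System.out.println(start.step('q') == end);  // expected: true ('a' <= 'q' <= 'z')
    System.out.println(start.step('0') == null); // expected: true (no matching transition)
  }
}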
+ */ + @Override + public int hashCode() { + return super.hashCode(); + } +} Property changes on: src\java\org\apache\lucene\util\automaton\State.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/StatePair.java =================================================================== --- src/java/org/apache/lucene/util/automaton/StatePair.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/StatePair.java (revision 0) @@ -0,0 +1,104 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +/** + * Pair of states. + * + *
* <p>
+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public class StatePair { + State s; + State s1; + State s2; + + StatePair(State s, State s1, State s2) { + this.s = s; + this.s1 = s1; + this.s2 = s2; + } + + /** + * Constructs a new state pair. + * + * @param s1 first state + * @param s2 second state + */ + public StatePair(State s1, State s2) { + this.s1 = s1; + this.s2 = s2; + } + + /** + * Returns first component of this pair. + * + * @return first state + */ + public State getFirstState() { + return s1; + } + + /** + * Returns second component of this pair. + * + * @return second state + */ + public State getSecondState() { + return s2; + } + + /** + * Checks for equality. + * + * @param obj object to compare with + * @return true if obj represents the same pair of states as this + * pair + */ + @Override + public boolean equals(Object obj) { + if (obj instanceof StatePair) { + StatePair p = (StatePair) obj; + return p.s1 == s1 && p.s2 == s2; + } else return false; + } + + /** + * Returns hash code. + * + * @return hash code + */ + @Override + public int hashCode() { + return s1.hashCode() + s2.hashCode(); + } +} Property changes on: src\java\org\apache\lucene\util\automaton\StatePair.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/Transition.java =================================================================== --- src/java/org/apache/lucene/util/automaton/Transition.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/Transition.java (revision 0) @@ -0,0 +1,179 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; + +/** + * Automaton transition. + *
* <p>
+ * A transition, which belongs to a source state, consists of a Unicode + * character interval and a destination state. + * + *
* <p>
+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public class Transition implements Serializable, Cloneable { + + static final long serialVersionUID = 40001; + + /* + * CLASS INVARIANT: min<=max + */ + + char min; + char max; + + State to; + + /** + * Constructs a new singleton interval transition. + * + * @param c transition character + * @param to destination state + */ + public Transition(char c, State to) { + min = max = c; + this.to = to; + } + + /** + * Constructs a new transition. Both end points are included in the interval. + * + * @param min transition interval minimum + * @param max transition interval maximum + * @param to destination state + */ + public Transition(char min, char max, State to) { + if (max < min) { + char t = max; + max = min; + min = t; + } + this.min = min; + this.max = max; + this.to = to; + } + + /** Returns minimum of this transition interval. */ + public char getMin() { + return min; + } + + /** Returns maximum of this transition interval. */ + public char getMax() { + return max; + } + + /** Returns destination of this transition. */ + public State getDest() { + return to; + } + + /** + * Checks for equality. + * + * @param obj object to compare with + * @return true if obj is a transition with same character interval + * and destination state as this transition. + */ + @Override + public boolean equals(Object obj) { + if (obj instanceof Transition) { + Transition t = (Transition) obj; + return t.min == min && t.max == max && t.to == to; + } else return false; + } + + /** + * Returns hash code. The hash code is based on the character interval (not + * the destination state). + * + * @return hash code + */ + @Override + public int hashCode() { + return min * 2 + max * 3; + } + + /** + * Clones this transition. + * + * @return clone with same character interval and destination state + */ + @Override + public Transition clone() { + try { + return (Transition) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + static void appendCharString(char c, StringBuilder b) { + if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.append(c); + else { + b.append("\\u"); + String s = Integer.toHexString(c); + if (c < 0x10) b.append("000").append(s); + else if (c < 0x100) b.append("00").append(s); + else if (c < 0x1000) b.append("0").append(s); + else b.append(s); + } + } + + /** + * Returns a string describing this state. Normally invoked via + * {@link Automaton#toString()}. 
+ */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + appendCharString(min, b); + if (min != max) { + b.append("-"); + appendCharString(max, b); + } + b.append(" -> ").append(to.number); + return b.toString(); + } + + void appendDot(StringBuilder b) { + b.append(" -> ").append(to.number).append(" [label=\""); + appendCharString(min, b); + if (min != max) { + b.append("-"); + appendCharString(max, b); + } + b.append("\"]\n"); + } +} Property changes on: src\java\org\apache\lucene\util\automaton\Transition.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/TransitionComparator.java =================================================================== --- src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0) @@ -0,0 +1,80 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Comparator; + +/** + * Comparator for state {@link Transition}s that orders unicode char range + * transitions in lexicographic order. + * + *
* <p>
+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +class TransitionComparator implements Comparator, Serializable { + + static final long serialVersionUID = 10001; + + boolean to_first; + + TransitionComparator(boolean to_first) { + this.to_first = to_first; + } + + /** + * Compares by (min, reverse max, to) or (to, min, reverse max). + */ + public int compare(Transition t1, Transition t2) { + if (to_first) { + if (t1.to != t2.to) { + if (t1.to == null) return -1; + else if (t2.to == null) return 1; + else if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + } + if (t1.min < t2.min) return -1; + if (t1.min > t2.min) return 1; + if (t1.max > t2.max) return -1; + if (t1.max < t2.max) return 1; + if (!to_first) { + if (t1.to != t2.to) { + if (t1.to == null) return -1; + else if (t2.to == null) return 1; + else if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + } + return 0; + } +} Property changes on: src\java\org\apache\lucene\util\automaton\TransitionComparator.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/test/org/apache/lucene/search/TestAutomatonQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestAutomatonQuery.java (revision 0) +++ src/test/org/apache/lucene/search/TestAutomatonQuery.java (revision 0) @@ -0,0 +1,233 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Collections; + +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; + +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; + +public class TestAutomatonQuery extends LuceneTestCase { + private IndexSearcher searcher; + + private final String FN = "field"; + + public void setUp() throws Exception { + super.setUp(); + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, + new StandardAnalyzer(Version.LUCENE_CURRENT, Collections.emptySet()), true, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + Field titleField = new Field("title", "some title", Field.Store.NO, + Field.Index.ANALYZED); + Field field = new Field(FN, "this is document one 2345", Field.Store.NO, + Field.Index.ANALYZED); + Field footerField = new Field("footer", "a footer", Field.Store.NO, + Field.Index.ANALYZED); + doc.add(titleField); + doc.add(field); + doc.add(footerField); + writer.addDocument(doc); + field.setValue("some text from doc two, a short piece. 5678.91"); + writer.addDocument(doc); + field.setValue("doc three has some different stuff: with numbers 1234 5678.9 and letter b"); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory, true); + } + + public void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + private Term newTerm(String value) { + return new Term(FN, value); + } + + private int automatonQueryNrHits(AutomatonQuery query) throws IOException { + return searcher.search(query, 5).totalHits; + } + + private void assertAutomatonHits(int expected, Automaton automaton) + throws IOException { + AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton); + + query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); + assertEquals(expected, automatonQueryNrHits(query)); + } + + /** + * Test some very simple automata. 
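The expected hit counts in the test below follow from the three documents indexed in setUp(); for instance, the interval automaton over [1233-2346] accepts exactly the tokens 2345 (document one) and 1234 (document three). Acceptance can also be checked outside the index with a RunAutomaton (a sketch; IntervalSketch is an illustrative name):

import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.RunAutomaton;

public class IntervalSketch {
  public static void main(String[] args) {
    RunAutomaton r = new RunAutomaton(BasicAutomata.makeInterval(1233, 2346, 0), true);
    System.out.println(r.run("1234")); // expected: true  (token from document three)
    System.out.println(r.run("2345")); // expected: true  (token from document one)
    System.out.println(r.run("5678")); // expected: false (outside the interval)
  }
}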
+ */ + public void testBasicAutomata() throws IOException { + assertAutomatonHits(0, BasicAutomata.makeEmpty()); + assertAutomatonHits(0, BasicAutomata.makeEmptyString()); + assertAutomatonHits(2, BasicAutomata.makeAnyChar()); + assertAutomatonHits(3, BasicAutomata.makeAnyString()); + assertAutomatonHits(2, BasicAutomata.makeString("doc")); + assertAutomatonHits(1, BasicAutomata.makeChar('a')); + assertAutomatonHits(2, BasicAutomata.makeCharRange('a', 'b')); + assertAutomatonHits(2, BasicAutomata.makeCharSet("ab")); + assertAutomatonHits(1, BasicAutomata.makeDecimalValue("5678.9")); + assertAutomatonHits(1, BasicAutomata.makeDecimalValue("2345")); + assertAutomatonHits(3, BasicAutomata.makeFractionDigits(3)); + assertAutomatonHits(1, BasicAutomata.makeIntegerValue("1234")); + assertAutomatonHits(2, BasicAutomata.makeInterval(1233, 2346, 0)); + assertAutomatonHits(1, BasicAutomata.makeInterval(0, 2000, 0)); + assertAutomatonHits(2, BasicAutomata.makeMaxInteger("003000")); + assertAutomatonHits(1, BasicAutomata.makeMinInteger("002000")); + assertAutomatonHits(2, BasicAutomata.makeStringMatcher("ome")); + assertAutomatonHits(2, BasicAutomata.makeTotalDigits(5)); + assertAutomatonHits(2, BasicOperations.union(BasicAutomata.makeChar('a'), + BasicAutomata.makeChar('b'))); + assertAutomatonHits(0, BasicOperations.intersection(BasicAutomata + .makeChar('a'), BasicAutomata.makeChar('b'))); + assertAutomatonHits(1, BasicOperations.minus(BasicAutomata + .makeMaxInteger("3000"), BasicAutomata.makeIntegerValue("1234"))); + } + + /** + * Test that a nondeterministic automaton works correctly. (It should will be + * determinized) + */ + public void testNFA() throws IOException { + // accept this or three, the union is an NFA (two transitions for 't' from + // initial state) + Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"), + BasicAutomata.makeString("three")); + assertAutomatonHits(2, nfa); + } + + public void testEquals() { + AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), BasicAutomata + .makeString("foobar")); + // reference to a1 + AutomatonQuery a2 = a1; + // same as a1 (accepts the same language, same term) + AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"), BasicOperations + .concatenate(BasicAutomata.makeString("foo"), BasicAutomata + .makeString("bar"))); + // different than a1 (same term, but different language) + AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), BasicAutomata + .makeString("different")); + // different than a1 (different term, same language) + AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), BasicAutomata + .makeString("foobar")); + + assertEquals(a1, a2); + assertEquals(a1.hashCode(), a2.hashCode()); + + assertEquals(a1, a3); + assertEquals(a1.hashCode(), a3.hashCode()); + + assertEquals(a1.toString(), a3.toString()); + + // different class + AutomatonQuery w1 = new WildcardQuery(newTerm("foobar")); + // different class + AutomatonQuery w2 = new RegexpQuery(newTerm("foobar")); + + assertFalse(a1.equals(w1)); + assertFalse(a1.equals(w2)); + assertFalse(w1.equals(w2)); + assertFalse(a1.equals(a4)); + assertFalse(a1.equals(a5)); + assertFalse(a1.equals(null)); + } + + /** + * Test that rewriting to a single term works as expected, preserves + * MultiTermQuery semantics. 
+ */ + public void testRewriteSingleTerm() throws IOException { + AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), + BasicAutomata.makeString("piece")); + assertTrue(aq.getEnum(searcher.getIndexReader()) instanceof SingleTermEnum); + assertTrue(aq.getTermsEnum(searcher.getIndexReader()) instanceof SingleTermsEnum); + assertEquals(1, automatonQueryNrHits(aq)); + } + + /** + * Test that rewriting to a prefix query works as expected, preserves + * MultiTermQuery semantics. + */ + public void testRewritePrefix() throws IOException { + Automaton pfx = BasicAutomata.makeString("do"); + pfx.expandSingleton(); // expand singleton representation for testing + Automaton prefixAutomaton = BasicOperations.concatenate(pfx, BasicAutomata + .makeAnyString()); + AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), + prefixAutomaton); + assertTrue(aq.getEnum(searcher.getIndexReader()) instanceof PrefixTermEnum); + assertTrue(aq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum); + assertEquals(3, automatonQueryNrHits(aq)); + } + + /** + * Test that a badly-performing automaton that must visit all the terms does + * not use the smart enumeration, this will just waste cpu. + */ + public void testLinearOptimization() throws IOException { + AutomatonQuery aq = new RegexpQuery(newTerm(".*ument")); + assertTrue(((AutomatonTermEnum) aq.getEnum(searcher.getIndexReader())).usesLinearMode()); + assertTrue(((AutomatonTermsEnum) aq.getTermsEnum(searcher.getIndexReader())).usesLinearMode()); + assertEquals(1, automatonQueryNrHits(aq)); + } + + /** + * Test that a badly-performing automaton that must visit all the terms does + * not use the smart enumeration, this will just waste cpu. + */ + public void testEmptyOptimization() throws IOException { + AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), + BasicAutomata.makeEmpty()); + //not yet available: assertTrue(aq.getEnum(searcher.getIndexReader()) instanceof EmptyTermEnum); + assertTrue(aq.getTermsEnum(searcher.getIndexReader()) instanceof EmptyTermsEnum); + assertEquals(0, automatonQueryNrHits(aq)); + } +} Property changes on: src\test\org\apache\lucene\search\TestAutomatonQuery.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java =================================================================== --- src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (revision 0) +++ src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (revision 0) @@ -0,0 +1,177 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Test the automaton query for several unicode corner cases, + * specifically enumerating strings/indexes containing supplementary characters, + * and the differences between UTF-8/UTF-32 and UTF-16 binary sort order. + */ +public class TestAutomatonQueryUnicode extends LuceneTestCase { + private IndexSearcher searcher; + + private final String FN = "field"; + + public void setUp() throws Exception { + super.setUp(); + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new KeywordAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + Field titleField = new Field("title", "some title", Field.Store.NO, + Field.Index.ANALYZED); + Field field = new Field(FN, "", Field.Store.NO, + Field.Index.ANALYZED); + Field footerField = new Field("footer", "a footer", Field.Store.NO, + Field.Index.ANALYZED); + doc.add(titleField); + doc.add(field); + doc.add(footerField); + field.setValue("\uD866\uDF05abcdef"); + writer.addDocument(doc); + field.setValue("\uD866\uDF06ghijkl"); + writer.addDocument(doc); + field.setValue("\uFB94mnopqr"); // this sorts before the previous two in UTF-8/UTF-32, but after in UTF-16!!! + writer.addDocument(doc); + field.setValue("\uFB95stuvwx"); // this one too. + writer.addDocument(doc); + field.setValue("a\uFFFCbc"); + writer.addDocument(doc); + field.setValue("a\uFFFDbc"); + writer.addDocument(doc); + field.setValue("a\uFFFEbc"); + writer.addDocument(doc); + field.setValue("a\uFB94bc"); + writer.addDocument(doc); + field.setValue("bacadaba"); + writer.addDocument(doc); + field.setValue("\uFFFD"); + writer.addDocument(doc); + field.setValue("\uFFFD\uD866\uDF05"); + writer.addDocument(doc); + field.setValue("\uFFFD\uFFFD"); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory, true); + } + + public void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + private Term newTerm(String value) { + return new Term(FN, value); + } + + private int automatonQueryNrHits(AutomatonQuery query) throws IOException { + return searcher.search(query, 5).totalHits; + } + + private void assertAutomatonHits(int expected, Automaton automaton) + throws IOException { + AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton); + + query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); + assertEquals(expected, automatonQueryNrHits(query)); + + query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); + assertEquals(expected, automatonQueryNrHits(query)); + } + + /** + * Test that AutomatonQuery interacts with lucene's sort order correctly. + * + * This expression matches something either starting with the arabic presentation forms block, + * or a supplementary character. 
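The ordering subtlety being tested comes down to comparing UTF-16 code units versus code points: the lead surrogate of a supplementary character is numerically smaller than U+FB94, even though the code point itself is larger. A minimal illustration (SortOrderSketch is a hypothetical name):

public class SortOrderSketch {
  public static void main(String[] args) {
    String supplementary = "\uD866\uDF05"; // one supplementary code point (U+29B05)
    String bmp = "\uFB94";                 // a BMP character from the Arabic presentation forms block

    // Code point order (the UTF-8/UTF-32 view): the supplementary character is larger.
    System.out.println(supplementary.codePointAt(0) > bmp.codePointAt(0)); // expected: true

    // UTF-16 code unit order (String.compareTo): the lead surrogate 0xD866 < 0xFB94,
    // so the supplementary character sorts BEFORE the BMP character.
    System.out.println(supplementary.compareTo(bmp) < 0); // expected: true
  }
}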
+ */ + public void testSortOrder() throws IOException { + Automaton a = new RegExp("((\uD866\uDF05)|\uFB94).*").toAutomaton(); + assertAutomatonHits(2, a); + } + + /** + * Test that AutomatonQuery properly seeks to supplementary characters. + * Transitions are modeled as UTF-16 code units, so without special handling + * by default it will try to seek to a lead surrogate with some DFAs + */ + public void testSeekSurrogate() throws IOException { + Automaton a = new RegExp("\uD866[a\uDF05\uFB93][a-z]{0,5}[fl]").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Try seeking to an ending lead surrogate. + */ + public void testSeekSurrogate2() throws IOException { + Automaton a = new RegExp("\uD866(\uDF06ghijkl)?").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Try seeking to an starting trail surrogate. + */ + public void testSeekSurrogate3() throws IOException { + Automaton a = new RegExp("[\uDF06\uFB94]mnopqr").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Try seeking to an medial/final trail surrogate. + */ + public void testSeekSurrogate4() throws IOException { + Automaton a = new RegExp("a[\uDF06\uFB94]bc").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Ensure the 'constant suffix' does not contain a leading trail surrogate. + */ + public void testSurrogateSuffix() throws IOException { + Automaton a = new RegExp(".*[\uD865\uD866]\uDF06ghijkl").toAutomaton(); + assertAutomatonHits(1, a); + } + + /** + * Try when the constant suffix is only a leading trail surrogate. + * instead this must use an empty suffix. + */ + public void testSurrogateSuffix2() throws IOException { + Automaton a = new RegExp(".*\uDF05").toAutomaton(); + assertAutomatonHits(1, a); + } +} Property changes on: src\test\org\apache\lucene\search\TestAutomatonQueryUnicode.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/test/org/apache/lucene/search/TestNumericRangeQuery32.java =================================================================== --- src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 887534) +++ src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (working copy) @@ -444,18 +444,16 @@ lower, upper, true, true); FilteredTermsEnum termEnum = q.getTermsEnum(searcher.getIndexReader()); int count = 0; - if (!termEnum.empty()) { - do { - final TermRef t = termEnum.term(); - if (t != null) { - final int val = NumericUtils.prefixCodedToInt(t.toString()); - assertTrue("value not in bounds " + val + " >= " + lower + " && " - + val + " <= " + upper, val >= lower && val <= upper); - count++; - } else - break; - } while (termEnum.next() != null); - } + while (termEnum.next() != null) { + final TermRef t = termEnum.term(); + if (t != null) { + final int val = NumericUtils.prefixCodedToInt(t.toString()); + assertTrue("value not in bounds " + val + " >= " + lower + " && " + + val + " <= " + upper, val >= lower && val <= upper); + count++; + } else + break; + } assertNull(termEnum.next()); System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + "] contained " + count + " terms."); Index: src/test/org/apache/lucene/search/TestRegexpQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestRegexpQuery.java (revision 0) +++ src/test/org/apache/lucene/search/TestRegexpQuery.java (revision 0) @@ -0,0 +1,124 @@ +package org.apache.lucene.search; + +/** 
+ * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonProvider; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Some simple regex tests, mostly converted from contrib's TestRegexQuery. + */ +public class TestRegexpQuery extends LuceneTestCase { + private IndexSearcher searcher; + private final String FN = "field"; + + public void setUp() throws Exception { + super.setUp(); + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), + true, IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field(FN, + "the quick brown fox jumps over the lazy ??? dog 493432 49344", + Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(directory, true); + } + + public void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + private Term newTerm(String value) { return new Term(FN, value); } + + private int regexQueryNrHits(String regex) throws IOException { + RegexpQuery query = new RegexpQuery( newTerm(regex)); + return searcher.search(query, 5).totalHits; + } + + public void testRegex1() throws IOException { + assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); + } + + public void testRegex2() throws IOException { + assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); + } + + public void testRegex3() throws IOException { + assertEquals(0, regexQueryNrHits("q.[aeiou]c")); + } + + public void testNumericRange() throws IOException { + assertEquals(1, regexQueryNrHits("<420000-600000>")); + assertEquals(0, regexQueryNrHits("<493433-600000>")); + } + + public void testRegexComplement() throws IOException { + assertEquals(1, regexQueryNrHits("4934~[3]")); + // not the empty lang, i.e. 
match all docs + assertEquals(1, regexQueryNrHits("~#")); + } + + public void testCustomProvider() throws IOException { + AutomatonProvider myProvider = new AutomatonProvider() { + // automaton that matches quick or brown + private Automaton quickBrownAutomaton = BasicOperations.union( + Arrays.asList(new Automaton[] { + BasicAutomata.makeString("quick"), + BasicAutomata.makeString("brown"), + BasicAutomata.makeString("bob")})); + + public Automaton getAutomaton(String name) throws IOException { + if (name.equals("quickBrown")) + return quickBrownAutomaton; + else + return null; + } + }; + RegexpQuery query = new RegexpQuery(newTerm(""), RegExp.ALL, myProvider); + assertEquals(1, searcher.search(query, 5).totalHits); + } + + /** + * Test a corner case for backtracking: + * In this case the term dictionary has 493432 followed by 49344. + * When backtracking from 49343... to 4934, its necessary + * to test that 4934 itself is ok before trying to append more characters. + */ + public void testBacktracking() throws IOException { + assertEquals(1, regexQueryNrHits("4934[314]")); + } +} + Property changes on: src\test\org\apache\lucene\search\TestRegexpQuery.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/test/org/apache/lucene/search/TestRegexpRandom.java =================================================================== --- src/test/org/apache/lucene/search/TestRegexpRandom.java (revision 0) +++ src/test/org/apache/lucene/search/TestRegexpRandom.java (revision 0) @@ -0,0 +1,144 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Random; + +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Create an index with terms from 0000-9999. + * Generates random regexps according to simple patterns, + * and validates the correct number of hits are returned. 
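The asserted hit counts below are just the number of four-digit terms each filled-in pattern can match; once fillPattern replaces every N with a concrete digit, ".NNN" leaves a single free position and therefore matches exactly 10 of the 10000 terms. A standalone check of that arithmetic (PatternCountSketch is an illustrative name):

import java.text.DecimalFormat;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RunAutomaton;

public class PatternCountSketch {
  public static void main(String[] args) {
    // ".523" stands for a filled-in ".NNN" pattern: only 0523, 1523, ..., 9523 match.
    RunAutomaton r = new RunAutomaton(new RegExp(".523").toAutomaton(), true);
    DecimalFormat df = new DecimalFormat("0000");
    int hits = 0;
    for (int i = 0; i < 10000; i++) {
      if (r.run(df.format(i))) hits++;
    }
    System.out.println(hits); // expected: 10
  }
}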
+ */ +public class TestRegexpRandom extends LuceneTestCase { + private Searcher searcher; + private Random random; + + @Override + protected void setUp() throws Exception { + super.setUp(); + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + + Document doc = new Document(); + Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + + NumberFormat df = new DecimalFormat("0000"); + for (int i = 0; i < 10000; i++) { + field.setValue(df.format(i)); + writer.addDocument(doc); + } + + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(dir); + } + + private char N() { + return (char) (0x30 + random.nextInt(10)); + } + + private String fillPattern(String wildcardPattern) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < wildcardPattern.length(); i++) { + switch(wildcardPattern.charAt(i)) { + case 'N': + sb.append(N()); + break; + default: + sb.append(wildcardPattern.charAt(i)); + } + } + return sb.toString(); + } + + private void assertPatternHits(String pattern, int numHits) throws Exception { + Query wq = new RegexpQuery(new Term("field", fillPattern(pattern))); + TopDocs docs = searcher.search(wq, 25); + assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits); + } + + @Override + protected void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + public void testRegexps() throws Exception { + random = newRandom(System.nanoTime()); + for (int i = 0; i < 100; i++) { + assertPatternHits("NNNN", 1); + assertPatternHits(".NNN", 10); + assertPatternHits("N.NN", 10); + assertPatternHits("NN.N", 10); + assertPatternHits("NNN.", 10); + } + + for (int i = 0; i < 10; i++) { + assertPatternHits(".{1,2}NN", 100); + assertPatternHits("N.{1,2}N", 100); + assertPatternHits("NN.{1,2}", 100); + assertPatternHits(".{1,3}N", 1000); + assertPatternHits("N.{1,3}", 1000); + assertPatternHits(".{1,4}", 10000); + + assertPatternHits("NNN[3-7]", 5); + assertPatternHits("NN[2-6][3-7]", 25); + assertPatternHits("N[1-5][2-6][3-7]", 125); + assertPatternHits("[0-4][3-7][4-8][5-9]", 625); + assertPatternHits("[3-7][2-6][0-4]N", 125); + assertPatternHits("[2-6][3-7]NN", 25); + assertPatternHits("[3-7]NNN", 5); + + assertPatternHits("NNN.*", 10); + assertPatternHits("NN.*", 100); + assertPatternHits("N.*", 1000); + assertPatternHits(".*", 10000); + + assertPatternHits(".*NNN", 10); + assertPatternHits(".*NN", 100); + assertPatternHits(".*N", 1000); + + assertPatternHits("N.*NN", 10); + assertPatternHits("NN.*N", 10); + + // combo of ? 
and * operators + assertPatternHits(".NN.*", 100); + assertPatternHits("N.N.*", 100); + assertPatternHits("NN..*", 100); + assertPatternHits(".N..*", 1000); + assertPatternHits("N...*", 1000); + + assertPatternHits(".*NN.", 100); + assertPatternHits(".*N..", 1000); + assertPatternHits(".*...", 10000); + assertPatternHits(".*.N.", 1000); + assertPatternHits(".*..N", 1000); + } + } +} Property changes on: src\test\org\apache\lucene\search\TestRegexpRandom.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/test/org/apache/lucene/search/TestWildcard.java =================================================================== --- src/test/org/apache/lucene/search/TestWildcard.java (revision 887534) +++ src/test/org/apache/lucene/search/TestWildcard.java (working copy) @@ -24,6 +24,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.Index; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.QueryParser; @@ -119,31 +120,6 @@ MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*")); assertMatches(searcher, wq, 2); - - MultiTermQuery expected = new PrefixQuery(new Term("field", "prefix")); - wq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - wq.setBoost(0.1F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); - wq.setBoost(0.2F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); - wq.setBoost(0.3F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); - wq.setBoost(0.4F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); } /** @@ -326,4 +302,57 @@ searcher.close(); } + @Deprecated + private static final class OldWildcardQuery extends MultiTermQuery { + final Term term; + + OldWildcardQuery(Term term) { + this.term = term; + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new WildcardTermEnum(reader, term); + } + + @Override + public String toString(String field) { + return "OldWildcard(" + term.toString()+ ")"; + } + } + + @Deprecated + public void testDeprecatedTermEnum() throws Exception { + RAMDirectory indexStore = getIndexStore("body", new String[] + {"metal", "metals"}); + IndexSearcher searcher = new IndexSearcher(indexStore, true); + Query query1 = new TermQuery(new Term("body", "metal")); + Query query2 = new OldWildcardQuery(new Term("body", "metal*")); + Query query3 = new OldWildcardQuery(new Term("body", "m*tal")); + Query query4 = new OldWildcardQuery(new Term("body", "m*tal*")); + Query query5 = new OldWildcardQuery(new Term("body", "m*tals")); + + BooleanQuery query6 = new BooleanQuery(); + query6.add(query5, BooleanClause.Occur.SHOULD); + + BooleanQuery query7 = new 
BooleanQuery(); + query7.add(query3, BooleanClause.Occur.SHOULD); + query7.add(query5, BooleanClause.Occur.SHOULD); + + // Queries do not automatically lower-case search terms: + Query query8 = new OldWildcardQuery(new Term("body", "M*tal*")); + + assertMatches(searcher, query1, 1); + assertMatches(searcher, query2, 2); + assertMatches(searcher, query3, 1); + assertMatches(searcher, query4, 2); + assertMatches(searcher, query5, 1); + assertMatches(searcher, query6, 1); + assertMatches(searcher, query7, 2); + assertMatches(searcher, query8, 0); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tall")), 0); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal")), 1); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal*")), 2); + } + } Index: src/test/org/apache/lucene/search/TestWildcardRandom.java =================================================================== --- src/test/org/apache/lucene/search/TestWildcardRandom.java (revision 0) +++ src/test/org/apache/lucene/search/TestWildcardRandom.java (revision 0) @@ -0,0 +1,136 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Random; + +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Create an index with terms from 0000-9999. + * Generates random wildcards according to patterns, + * and validates the correct number of hits are returned. 
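+ * <p>
+ * For example, the template "N*" might be filled to a concrete wildcard such
+ * as "7*", which against this index should match exactly 1000 terms
+ * (7000 through 7999). A minimal sketch of the equivalent direct query,
+ * with illustrative values only (the test itself goes through assertPatternHits):
+ * <pre>
+ *   Query q = new WildcardQuery(new Term("field", "7*"));
+ *   TopDocs docs = searcher.search(q, 25);
+ *   // docs.totalHits is expected to be 1000
+ * </pre>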
+ */ +public class TestWildcardRandom extends LuceneTestCase { + private Searcher searcher; + private Random random; + + @Override + protected void setUp() throws Exception { + super.setUp(); + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + + Document doc = new Document(); + Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + + NumberFormat df = new DecimalFormat("0000"); + for (int i = 0; i < 10000; i++) { + field.setValue(df.format(i)); + writer.addDocument(doc); + } + + writer.optimize(); + writer.close(); + searcher = new IndexSearcher(dir); + } + + private char N() { + return (char) (0x30 + random.nextInt(10)); + } + + private String fillPattern(String wildcardPattern) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < wildcardPattern.length(); i++) { + switch(wildcardPattern.charAt(i)) { + case 'N': + sb.append(N()); + break; + default: + sb.append(wildcardPattern.charAt(i)); + } + } + return sb.toString(); + } + + private void assertPatternHits(String pattern, int numHits) throws Exception { + Query wq = new WildcardQuery(new Term("field", fillPattern(pattern))); + TopDocs docs = searcher.search(wq, 25); + assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits); + } + + @Override + protected void tearDown() throws Exception { + searcher.close(); + super.tearDown(); + } + + public void testWildcards() throws Exception { + random = newRandom(System.nanoTime()); + for (int i = 0; i < 100; i++) { + assertPatternHits("NNNN", 1); + assertPatternHits("?NNN", 10); + assertPatternHits("N?NN", 10); + assertPatternHits("NN?N", 10); + assertPatternHits("NNN?", 10); + } + + for (int i = 0; i < 10; i++) { + assertPatternHits("??NN", 100); + assertPatternHits("N??N", 100); + assertPatternHits("NN??", 100); + assertPatternHits("???N", 1000); + assertPatternHits("N???", 1000); + assertPatternHits("????", 10000); + + assertPatternHits("NNN*", 10); + assertPatternHits("NN*", 100); + assertPatternHits("N*", 1000); + assertPatternHits("*", 10000); + + assertPatternHits("*NNN", 10); + assertPatternHits("*NN", 100); + assertPatternHits("*N", 1000); + + assertPatternHits("N*NN", 10); + assertPatternHits("NN*N", 10); + + // combo of ? and * operators + assertPatternHits("?NN*", 100); + assertPatternHits("N?N*", 100); + assertPatternHits("NN?*", 100); + assertPatternHits("?N?*", 1000); + assertPatternHits("N??*", 1000); + + assertPatternHits("*NN?", 100); + assertPatternHits("*N??", 1000); + assertPatternHits("*???", 10000); + assertPatternHits("*?N?", 1000); + assertPatternHits("*??N", 1000); + } + } +} Property changes on: src\test\org\apache\lucene\search\TestWildcardRandom.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native