Index: NOTICE.txt =================================================================== --- NOTICE.txt (revision 882888) +++ NOTICE.txt (working copy) @@ -28,3 +28,6 @@ ICU4J, (under contrib/collation) is licensed under an MIT styles license (contrib/collation/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Brics Automaton (under src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ Index: src/java/org/apache/lucene/search/AutomatonQuery.java =================================================================== --- src/java/org/apache/lucene/search/AutomatonQuery.java (revision 0) +++ src/java/org/apache/lucene/search/AutomatonQuery.java (revision 0) @@ -0,0 +1,155 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.SingleTermEnum; +import org.apache.lucene.util.ToStringUtils; + +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.MinimizationOperations; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +/** + * A {@link Query} that will match terms against a finite-state machine. + *
+ * This query will match documents that contain terms accepted by a given + * finite-state machine. The automaton can be constructed with the + * {@link org.apache.lucene.util.automaton} API. Alternatively, it can be + * created from a regular expression with {@link RegexpQuery} or from + * the standard Lucene wildcard syntax with {@link WildcardQuery}. + *
+ *
+ * When the query is executed, it will create an equivalent minimal DFA of the
+ * finite-state machine, and will enumerate the term dictionary in an
+ * intelligent way to reduce the number of comparisons. For example: the regular
+ * expression of [dl]og? will make approximately four comparisons:
+ * do, dog, lo, and log.
+ *
+ * The algorithm is such: + *
+ * The algorithm does not attempt to actually skip to the next string that is + * completely accepted. This is not possible when the language accepted by the + * FSM is not finite (i.e. * operator). + *
+ *+ * If the DFA has a leading Kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case it's much + * better to do just that than to use smart enumeration. + * This heuristic looks for an initial loop, with a range of at least 1/3 + * of the Unicode BMP. + * Use {@link #usesLinearMode} to find out whether it enumerates all terms + * in linear mode without seeking. + *
+ */ +public class AutomatonTermEnum extends FilteredTermEnum { + private final IndexReader reader; + private final Term queryTerm; + private final RunAutomaton runAutomaton; + private final Automaton automaton; + private final boolean linearMode; + private final String commonPrefix; + // the last term that was compared + private Term lastTerm = null; + private boolean endEnum = false; + // for complex machines that must make a lot of comparisons + private final MapIn linear mode, it also sets {@link #endEnum} if the enumeration is exhausted. + * In smart mode, it will never do this. + */ + @Override + protected boolean termCompare(final Term term) { + lastTerm = term; + final String text = term.text(); + if (term.field() == queryTerm.field() && (!linearMode || text.startsWith(commonPrefix))) { + return runAutomaton.run(text); + } else { + // only set endEnum in linearMode + endEnum = linearMode; + return false; + } + } + + /** + * In smart mode, increments to the next term matching this automaton. + * After a successful comparison, it simply tries the next term. + * After an unsuccessful comparison, it seeks to a smarter position. + *
If the enum is in linear mode, it simply calls {@code super.next()} to
+ * just filter the current enum until {@link #endEnum} returns {@code true}.
+ */
+ @Override
+ public boolean next() throws IOException {
+   // Linear mode: let the inherited enumeration filter the terms one by one.
+   if (linearMode) {
+     return super.next();
+   }
+
+   for (;;) {
+     if (lastTerm != currentTerm) {
+       // The last comparison was a miss: ask the automaton for the next
+       // string that could still be accepted and seek straight to it.
+       final String seekTarget = nextString(lastTerm.text());
+       if (seekTarget == null) {
+         // nothing after lastTerm can match any more
+         currentTerm = null;
+         endEnum = true;
+         return false;
+       }
+       // swap the old enumerator for one positioned at the seek target
+       actualEnum.close();
+       actualEnum = reader.terms(lastTerm.createTerm(seekTarget));
+     } else {
+       // The last comparison was a hit: skip the seek computation and just
+       // try the following term. This pays off for automata that match many
+       // sequential terms, such as ab*
+       actualEnum.next();
+     }
+
+     final Term candidate = actualEnum.term();
+
+     // no term at all, or a term of another field: the enumeration is over
+     final boolean exhausted =
+         candidate == null || candidate.field() != queryTerm.field();
+     if (exhausted) {
+       currentTerm = null;
+       endEnum = true;
+       return false;
+     }
+
+     // termCompare also records the candidate as lastTerm
+     if (termCompare(candidate)) {
+       currentTerm = candidate;
+       return true;
+     }
+   }
+ }
+
+ /**
+  * Reports whether the enumeration has been exhausted.
+  * Only meaningful in linear mode: in smart mode {@link #next} deals with
+  * exhausted enums itself, so the value returned here is undefined.
+  */
+ @Override
+ protected boolean endEnum() {
+   assert linearMode : "endEnum() should only be called in linear mode";
+   return this.endEnum;
+ }
+
+ /**
+  * Returns the next String in lexicographic order after s that will not put
+  * the machine into a reject state. If such a string does not exist, returns
+  * null.
+  * <p>
+  * The correctness of this method depends upon the automaton being
+  * deterministic, and having no transitions to dead states.
+  *
+  * @param s input String
+  * @return next valid String, or null when all candidates are exhausted
+  */
+ private String nextString(String s) {
+   while (true) {
+     // Each pass walks s from its first character, so the walk must also
+     // restart from the initial state. Carrying the state over from the
+     // previous pass would step a backtracked candidate from the wrong state.
+     State state = automaton.getInitialState();
+     int pos;
+
+     // walk the automaton until a character is rejected.
+     for (pos = 0; pos < s.length(); pos++) {
+       State nextState = step(state, s.charAt(pos));
+       if (nextState == null) {
+         break;
+       }
+       state = nextState;
+     }
+
+     // take the useful portion, and the last non-reject state, and attempt to
+     // append characters that will match.
+     String nextString = nextString(s, state, pos);
+     if (nextString != null) {
+       return nextString;
+     }
+
+     // no more solutions exist from this useful portion, backtrack
+     if (pos == 0) {
+       // all solutions exhausted
+       return null;
+     }
+     char nextChar = s.charAt(pos - 1);
+     nextChar++;
+     String sprime = s.substring(0, pos - 1) + nextChar;
+     // if this is accepted it is good to go as-is.
+     if (runAutomaton.run(sprime)) {
+       return sprime;
+     }
+     // otherwise retry the whole procedure on the shortened candidate
+     s = sprime;
+   }
+ }
+
+ /**
+ * Returns the next String in lexicographic order after s that will not put
+ * the machine into a reject state. Appends some characters to the useful
+ * portion. If this cannot satisfy the machine, returns null. This method will
+ * walk the minimal path, in lexicographic order, as long as possible.
+ *
+ * @param s input String
+ * @param state current non-reject state
+ * @param useful most useful portion of the string
+ * @return next valid String
+ */
+ private String nextString(String s, State state, int useful) {
+ /*
+ * the next lexicographic character must be greater than the existing
+ * character, if it exists.
+ */
+ char c = 0;
+ if (useful < s.length()) {
+ c = s.charAt(useful);
+ c++; // cannot overflow as U+FFFF cannot be in the index.
+ }
+
+ StringBuilder sb = new StringBuilder();
+ // append the useful portion
+ sb.append(s, 0, useful);
+
+ Set
+ * The supported syntax is documented in the {@link RegExp} class.
+ *
+ * Note this query can be slow, as it needs to iterate over many terms. In order
+ * to prevent extremely slow RegexpQueries, a Regexp term should not start with
+ * the expression
+ * By default, all regular expression features are enabled.
+ * This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* rewrite method.
*
- * @see WildcardTermEnum */
-public class WildcardQuery extends MultiTermQuery {
- private boolean termContainsWildcard;
- private boolean termIsPrefix;
- protected Term term;
-
+ * @see AutomatonQuery */
+public class WildcardQuery extends AutomatonQuery {
+ /**
+ * Constructs a query for terms matching
* Term enumerations are always ordered by Term.compareTo(). Each term in
* the enumeration is greater than all that precede it.
+ * @deprecated This class will be removed in Lucene 4.0
*/
-public class WildcardTermEnum extends FilteredTermEnum {
- final Term searchTerm;
- final String field;
- final String text;
- final String pre;
- final int preLen;
- boolean endEnum = false;
+public class WildcardTermEnum extends AutomatonTermEnum {
/**
* Creates a new Note: This method is no longer used by this class!
+ * It is dead code and only available for backwards compatibility.
*/
public static final boolean wildcardEquals(String pattern, int patternIdx,
String string, int stringIdx)
Index: src/java/org/apache/lucene/util/automaton/Automaton.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0)
@@ -0,0 +1,819 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InvalidClassException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.OptionalDataException;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Finite-state automaton with regular expression operations.
+ *
+ * Class invariants:
+ *
+ * If the states or transitions are manipulated manually, the
+ * {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods
+ * should be used afterwards to restore representation invariants that are
+ * assumed by the built-in automata operations.
+ *
+ *
+ * WARNING: The status of the Automaton feature is
+ * experimental. The APIs introduced here might change in the future and will
+ * not be supported anymore in such a case.
+ */
+public class Automaton implements Serializable, Cloneable {
+
+ static final long serialVersionUID = 10001;
+
+ /**
+ * Minimize using Hopcroft's O(n log n) algorithm. This is regarded as one of
+ * the most generally efficient algorithms that exist.
+ *
+ * @see #setMinimization(int)
+ */
+ public static final int MINIMIZE_HOPCROFT = 2;
+
+ /** Selects minimization algorithm (default:
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ *
+ * @see RunAutomaton#newMatcher(java.lang.CharSequence)
+ * @see RunAutomaton#newMatcher(java.lang.CharSequence, int, int)
+ */
+public class AutomatonMatcher implements MatchResult {
+
+ /**
+  * Stores the input sequence and the automaton to match it against.
+  *
+  * @param chars input to search
+  * @param automaton automaton used for matching
+  */
+ AutomatonMatcher(final CharSequence chars, final RunAutomaton automaton) {
+   this.automaton = automaton;
+   this.chars = chars;
+ }
+
+ private RunAutomaton automaton;
+ private CharSequence chars;
+
+ private int matchStart = -1;
+
+ private int matchEnd = -1;
+
+ /**
+  * Points this matcher at a new CharSequence and discards the previous
+  * match position (both match bounds go back to -1).
+  *
+  * @param chars new input to search
+  */
+ public void reset(final CharSequence chars) {
+   this.matchStart = this.matchEnd = -1;
+   this.chars = chars;
+ }
+
+ /**
+ * Find the next matching subsequence of the input.
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public interface AutomatonProvider {
+
+  /**
+   * Looks up an automaton by name.
+   *
+   * @param name automaton name
+   * @return automaton registered under that name
+   * @throws IOException if errors occur
+   */
+  Automaton getAutomaton(String name) throws IOException;
+}
Property changes on: src\java\org\apache\lucene\util\automaton\AutomatonProvider.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/BasicAutomata.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0)
@@ -0,0 +1,482 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Construction of basic automata.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class BasicAutomata {
+ // used by getWhitespaceAutomaton to match basic whitespace
+ private static final Automaton ws = Automaton.minimize(BasicAutomata
+ .makeCharSet(" \t\n\r").repeat());
+
+ private BasicAutomata() {}
+
+ /**
+  * Returns a new (deterministic) automaton with the empty language: its
+  * initial state is a freshly created state that is not marked accepting
+  * here and receives no transitions.
+  */
+ public static Automaton makeEmpty() {
+   final Automaton empty = new Automaton();
+   final State start = new State();
+   empty.deterministic = true;
+   empty.initial = start;
+   return empty;
+ }
+
+ /**
+  * Returns a new (deterministic) automaton that accepts only the empty
+  * string, represented as a singleton automaton for "".
+  */
+ public static Automaton makeEmptyString() {
+   final Automaton onlyEmpty = new Automaton();
+   onlyEmpty.deterministic = true;
+   onlyEmpty.singleton = "";
+   return onlyEmpty;
+ }
+
+ /**
+  * Returns a new (deterministic) automaton that accepts all strings: one
+  * accepting state that loops to itself on every char value.
+  */
+ public static Automaton makeAnyString() {
+   final Automaton any = new Automaton();
+   final State loop = new State();
+   loop.accept = true;
+   // self-loop covering the complete char range
+   final Transition everyChar =
+       new Transition(Character.MIN_VALUE, Character.MAX_VALUE, loop);
+   loop.transitions.add(everyChar);
+   any.initial = loop;
+   any.deterministic = true;
+   return any;
+ }
+
+ /**
+  * Returns a new (deterministic) automaton that accepts any single
+  * character, i.e. the char range from {@link Character#MIN_VALUE} to
+  * {@link Character#MAX_VALUE}.
+  */
+ public static Automaton makeAnyChar() {
+   final char lowest = Character.MIN_VALUE;
+   final char highest = Character.MAX_VALUE;
+   return makeCharRange(lowest, highest);
+ }
+
+ /**
+  * Returns a new (deterministic) automaton that accepts a single character
+  * of the given value, stored as a one-character singleton.
+  *
+  * @param c the only character accepted
+  */
+ public static Automaton makeChar(char c) {
+   final Automaton single = new Automaton();
+   single.deterministic = true;
+   single.singleton = String.valueOf(c);
+   return single;
+ }
+
+ /**
+  * Returns a new (deterministic) automaton that accepts a single char whose
+  * value is in the given interval (including both end points). Equal end
+  * points are delegated to {@link #makeChar(char)}; for an inverted interval
+  * (min greater than max) no transition is added.
+  *
+  * @param min lower end point (inclusive)
+  * @param max upper end point (inclusive)
+  */
+ public static Automaton makeCharRange(char min, char max) {
+   if (min == max) {
+     return makeChar(min);
+   }
+   final Automaton range = new Automaton();
+   final State start = new State();
+   final State end = new State();
+   end.accept = true;
+   if (max >= min) {
+     start.transitions.add(new Transition(min, max, end));
+   }
+   range.initial = start;
+   range.deterministic = true;
+   return range;
+ }
+
+ /**
+  * Returns a new (deterministic) automaton that accepts a single character
+  * in the given set. A one-character set is delegated to
+  * {@link #makeChar(char)}; otherwise one transition per character is added
+  * and the automaton is reduced.
+  *
+  * @param set characters to accept
+  */
+ public static Automaton makeCharSet(String set) {
+   if (set.length() == 1) {
+     return makeChar(set.charAt(0));
+   }
+   final Automaton anyOf = new Automaton();
+   final State start = new State();
+   final State end = new State();
+   end.accept = true;
+   anyOf.initial = start;
+   for (final char member : set.toCharArray()) {
+     start.transitions.add(new Transition(member, end));
+   }
+   anyOf.deterministic = true;
+   anyOf.reduce();
+   return anyOf;
+ }
+
+ /**
+  * Constructs sub-automaton corresponding to decimal numbers of length
+  * x.substring(n).length(): a chain of '0'-'9' transitions ending in an
+  * accepting state. The recursion only stops once n equals x.length().
+  *
+  * @param x digit string whose remaining length is matched
+  * @param n index in x at which the sub-automaton starts
+  */
+ private static State anyOfRightLength(String x, int n) {
+   final State node = new State();
+   if (n == x.length()) {
+     // no digits left to consume
+     node.setAccept(true);
+     return node;
+   }
+   final State rest = anyOfRightLength(x, n + 1);
+   node.addTransition(new Transition('0', '9', rest));
+   return node;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of value at least
+ * x.substring(n) and length x.substring(n).length().
+ */
+ private static State atLeast(String x, int n, Collection
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class BasicOperations {
+
+ private BasicOperations() {}
+
+ /**
+  * Returns an automaton that accepts the concatenation of the languages of
+  * the given automata. Two singletons are joined as plain strings; otherwise
+  * every accept state of the first automaton gets an epsilon edge to the
+  * initial state of the second.
+  * <p>
+  * Complexity: linear in number of states.
+  */
+ static public Automaton concatenate(Automaton a1, Automaton a2) {
+   if (a1.isSingleton() && a2.isSingleton()) {
+     return BasicAutomata.makeString(a1.singleton + a2.singleton);
+   }
+   // the same instance on both sides needs two independent copies
+   final boolean sameInstance = a1 == a2;
+   a1 = sameInstance ? a1.cloneExpanded() : a1.cloneExpandedIfRequired();
+   a2 = sameInstance ? a2.cloneExpanded() : a2.cloneExpandedIfRequired();
+   for (State accepting : a1.getAcceptStates()) {
+     accepting.accept = false;
+     accepting.addEpsilon(a2.initial);
+   }
+   a1.deterministic = false;
+   a1.clearHashCode();
+   a1.checkMinimizeAlways();
+   return a1;
+ }
+
+ /**
+ * Returns an automaton that accepts the concatenation of the languages of the
+ * given automata.
+ *
+ * Complexity: linear in total number of states.
+ */
+ static public Automaton concatenate(List
+ * Complexity: linear in number of states.
+ */
+ static public Automaton optional(Automaton a) {
+   final Automaton result = a.cloneExpandedIfRequired();
+   // new accepting entry state with an epsilon edge to the old initial state
+   final State entry = new State();
+   entry.addEpsilon(result.initial);
+   entry.accept = true;
+   result.initial = entry;
+   result.deterministic = false;
+   result.clearHashCode();
+   result.checkMinimizeAlways();
+   return result;
+ }
+
+ /**
+  * Returns an automaton that accepts the Kleene star (zero or more
+  * concatenated repetitions) of the language of the given automaton. Never
+  * modifies the input automaton language; the construction is applied to an
+  * expanded clone.
+  * <p>
+  * Complexity: linear in number of states.
+  */
+ static public Automaton repeat(Automaton a) {
+   final Automaton starred = a.cloneExpanded();
+   final State entry = new State();
+   entry.accept = true;
+   entry.addEpsilon(starred.initial);
+   // every accept state gets an epsilon edge back to the new entry state
+   for (State accepting : starred.getAcceptStates()) {
+     accepting.addEpsilon(entry);
+   }
+   starred.initial = entry;
+   starred.deterministic = false;
+   starred.clearHashCode();
+   starred.checkMinimizeAlways();
+   return starred;
+ }
+
+ /**
+ * Returns an automaton that accepts
+ * Complexity: linear in number of states and in
+ * Complexity: linear in number of states and in
+ * Complexity: linear in number of states (if already deterministic).
+ */
+ static public Automaton complement(Automaton a) {
+   final Automaton result = a.cloneExpandedIfRequired();
+   result.determinize();
+   result.totalize();
+   // flip the accept flag of every state
+   for (State state : result.getStates()) {
+     state.accept = !state.accept;
+   }
+   result.removeDeadTransitions();
+   return result;
+ }
+
+ /**
+ * Returns a (deterministic) automaton that accepts the intersection of the
+ * language of
+ * Complexity: quadratic in number of states (if already deterministic).
+ */
+ static public Automaton minus(Automaton a1, Automaton a2) {
+   // nothing to subtract from, or subtracting an automaton from itself
+   if (isEmpty(a1) || a1 == a2) {
+     return BasicAutomata.makeEmpty();
+   }
+   // subtracting the empty language changes nothing
+   if (isEmpty(a2)) {
+     return a1.cloneIfRequired();
+   }
+   // a singleton either disappears completely or survives untouched
+   if (a1.isSingleton()) {
+     return a2.run(a1.singleton)
+         ? BasicAutomata.makeEmpty()
+         : a1.cloneIfRequired();
+   }
+   return intersection(a1, a2.complement());
+ }
+
+ /**
+ * Returns an automaton that accepts the intersection of the languages of the
+ * given automata. Never modifies the input automata languages.
+ *
+ * Complexity: quadratic in number of states.
+ */
+ static public Automaton intersection(Automaton a1, Automaton a2) {
+ if (a1.isSingleton()) {
+ if (a2.run(a1.singleton)) return a1.cloneIfRequired();
+ else return BasicAutomata.makeEmpty();
+ }
+ if (a2.isSingleton()) {
+ if (a1.run(a2.singleton)) return a2.cloneIfRequired();
+ else return BasicAutomata.makeEmpty();
+ }
+ if (a1 == a2) return a1.cloneIfRequired();
+ Transition[][] transitions1 = Automaton
+ .getSortedTransitions(a1.getStates());
+ Transition[][] transitions2 = Automaton
+ .getSortedTransitions(a2.getStates());
+ Automaton c = new Automaton();
+ LinkedList
+ * Complexity: quadratic in number of states.
+ */
+ public static boolean subsetOf(Automaton a1, Automaton a2) {
+ if (a1 == a2) return true;
+ if (a1.isSingleton()) {
+ if (a2.isSingleton()) return a1.singleton.equals(a2.singleton);
+ return a2.run(a1.singleton);
+ }
+ a2.determinize();
+ Transition[][] transitions1 = Automaton
+ .getSortedTransitions(a1.getStates());
+ Transition[][] transitions2 = Automaton
+ .getSortedTransitions(a2.getStates());
+ LinkedList
+ * Complexity: linear in number of states.
+ */
+ public static Automaton union(Automaton a1, Automaton a2) {
+   // Identical instances, or two singletons holding the same string: the
+   // union is just the first operand.
+   if ((a1.isSingleton() && a2.isSingleton() && a1.singleton
+       .equals(a2.singleton))
+       || a1 == a2) {
+     return a1.cloneIfRequired();
+   }
+   // a1 != a2 is guaranteed from here on, so the former "same instance"
+   // branch could never run and has been removed.
+   a1 = a1.cloneExpandedIfRequired();
+   a2 = a2.cloneExpandedIfRequired();
+   // new initial state with epsilon edges to both original initial states
+   State s = new State();
+   s.addEpsilon(a1.initial);
+   s.addEpsilon(a2.initial);
+   a1.initial = s;
+   a1.deterministic = false;
+   a1.clearHashCode();
+   a1.checkMinimizeAlways();
+   return a1;
+ }
+
+ /**
+ * Returns an automaton that accepts the union of the languages of the given
+ * automata.
+ *
+ * Complexity: linear in number of states.
+ */
+ public static Automaton union(Collection
+ * Complexity: exponential in number of states.
+ */
+ public static void determinize(Automaton a) {
+ if (a.deterministic || a.isSingleton()) return;
+ Set
+ * Complexity: linear in the length of the string.
+ *
+ * Note: for full performance, use the {@link RunAutomaton} class.
+ */
+ public static boolean run(Automaton a, String s) {
+ if (a.isSingleton()) return s.equals(a.singleton);
+ if (a.deterministic) {
+ State p = a.initial;
+ for (int i = 0; i < s.length(); i++) {
+ State q = p.step(s.charAt(i));
+ if (q == null) return false;
+ p = q;
+ }
+ return p.accept;
+ } else {
+ Set
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class MinimizationOperations {
+
+ private MinimizationOperations() {}
+
+ /**
+  * Minimizes (and determinizes if not already deterministic) the given
+  * automaton, then recomputes its cached hash code. Singleton automata skip
+  * the minimization step.
+  *
+  * @see Automaton#setMinimization(int)
+  */
+ public static void minimize(Automaton a) {
+   if (!a.isSingleton()) {
+     minimizeHopcroft(a);
+   }
+   // recompute hash code; 0 is never stored, 1 is used in its place
+   final int hash = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2;
+   a.hash_code = (hash == 0) ? 1 : hash;
+ }
+
+ private static
+This package contains a full DFA/NFA implementation with Unicode
+alphabet and support for all standard (and a number of non-standard)
+regular expression operations.
+
+The most commonly used functionality is located in the classes
+{@link org.apache.lucene.util.automaton.Automaton} and
+{@link org.apache.lucene.util.automaton.RegExp}.
+
+For more information, go to the package home page at
+http://www.brics.dk/automaton/.
+
+WARNING: The status of the Automaton feature is experimental.
+The APIs introduced here might change in the future and will not be
+supported anymore in such a case.
+
+
Property changes on: src\java\org\apache\lucene\util\automaton\package.html
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/RegExp.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0)
@@ -0,0 +1,1003 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Regular expression extension to {@link Automaton}.
+ * Regular expressions are built from the following abstract syntax:
+ *
+ *
+ * The productions marked [OPTIONAL] are only allowed if
+ * specified by the syntax flags passed to the RegExp constructor.
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class RegExp {
+
+  /** Kinds of regular expression constructs; the declaration order is significant for ordinals. */
+  enum Kind {
+    REGEXP_UNION,
+    REGEXP_CONCATENATION,
+    REGEXP_INTERSECTION,
+    REGEXP_OPTIONAL,
+    REGEXP_REPEAT,
+    REGEXP_REPEAT_MIN,
+    REGEXP_REPEAT_MINMAX,
+    REGEXP_COMPLEMENT,
+    REGEXP_CHAR,
+    REGEXP_CHAR_RANGE,
+    REGEXP_ANYCHAR,
+    REGEXP_EMPTY,
+    REGEXP_STRING,
+    REGEXP_ANYSTRING,
+    REGEXP_AUTOMATON,
+    REGEXP_INTERVAL
+  }
+
+ /**
+ * Syntax flag, enables intersection (&).
+ */
+ public static final int INTERSECTION = 0x0001;
+
+ /**
+ * Syntax flag, enables complement (~).
+ */
+ public static final int COMPLEMENT = 0x0002;
+
+ /**
+ * Syntax flag, enables empty language (#).
+ */
+ public static final int EMPTY = 0x0004;
+
+ /**
+ * Syntax flag, enables anystring (@).
+ */
+ public static final int ANYSTRING = 0x0008;
+
+ /**
+ * Syntax flag, enables named automata (<identifier>).
+ */
+ public static final int AUTOMATON = 0x0010;
+
+ /**
+ * Syntax flag, enables numerical intervals (<n-m>).
+ */
+ public static final int INTERVAL = 0x0020;
+
+ /**
+ * Syntax flag, enables all optional regexp syntax.
+ */
+ public static final int ALL = 0xffff;
+
+ /**
+ * Syntax flag, enables no optional regexp syntax.
+ */
+ public static final int NONE = 0x0000;
+
+ // Class-level mutation flag, off by default. NOTE(review): no read of this
+ // flag is visible in this part of the file - confirm where it is used.
+ private static boolean allow_mutation = false;
+
+ // Kind of construct this expression object represents.
+ Kind kind;
+ // NOTE(review): presumably the operand sub-expressions of this construct -
+ // confirm against the parsing code.
+ RegExp exp1, exp2;
+ // NOTE(review): presumably payloads used by specific kinds (string, single
+ // char, repeat/interval bounds, char range end points) - confirm.
+ String s;
+ char c;
+ int min, max, digits;
+ char from, to;
+
+ // NOTE(review): presumably parser state (input text, syntax flags, current
+ // position) - confirm against the parsing code.
+ String b;
+ int flags;
+ int pos;
+
+ // Package-private no-argument constructor; leaves every field at its default value.
+ RegExp() {}
+
+ /**
+ * Constructs new
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class RunAutomaton implements Serializable {
+
+ static final long serialVersionUID = 20001;
+
+ int size; // number of states, as returned by getSize()
+ boolean[] accept; // accept[state] is the acceptance status of that state
+ int initial; // number of the initial state
+ int[] transitions; // delta(state,c) = transitions[state*points.length +
+ // getCharClass(c)]; -1 is treated as "no transition" by toString()
+ char[] points; // char interval start points
+ int[] classmap; // map from char number to char class index
+
+  /**
+   * Fills the {@code classmap} lookup table so that, for every char value, the
+   * table holds the index of the entry in {@code points} whose interval the
+   * char belongs to (index 0 for chars below the second start point).
+   */
+  final void setAlphabet() {
+    final int tableSize = Character.MAX_VALUE - Character.MIN_VALUE + 1;
+    classmap = new int[tableSize];
+    int cls = 0;
+    for (int ch = 0; ch < tableSize; ch++) {
+      // move on to the next class as soon as its start point is reached
+      final int next = cls + 1;
+      if (next < points.length && ch == points[next]) {
+        cls = next;
+      }
+      classmap[ch] = cls;
+    }
+  }
+
+ /**
+ * Returns a string representation of this automaton.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ b.append("initial state: ").append(initial).append("\n");
+ for (int i = 0; i < size; i++) {
+ b.append("state " + i);
+ if (accept[i]) b.append(" [accept]:\n");
+ else b.append(" [reject]:\n");
+ for (int j = 0; j < points.length; j++) {
+ int k = transitions[i * points.length + j];
+ if (k != -1) {
+ char min = points[j];
+ char max;
+ if (j + 1 < points.length) max = (char) (points[j + 1] - 1);
+ else max = Character.MAX_VALUE;
+ b.append(" ");
+ Transition.appendCharString(min, b);
+ if (min != max) {
+ b.append("-");
+ Transition.appendCharString(max, b);
+ }
+ b.append(" -> ").append(k).append("\n");
+ }
+ }
+ }
+ return b.toString();
+ }
+
+ /**
+ * Returns the number of states in this automaton.
+ *
+ * @return the value of the size field
+ */
+ public int getSize() {
+ return size;
+ }
+
+ /**
+ * Returns acceptance status for given state.
+ *
+ * @param state state number; used directly as an index into the accept
+ * array, no range check is performed here
+ * @return true if the state is marked as accepting
+ */
+ public boolean isAccept(int state) {
+ return accept[state];
+ }
+
+ /**
+ * Returns initial state.
+ *
+ * @return number of the initial state
+ */
+ public int getInitialState() {
+ return initial;
+ }
+
+ /**
+ * Returns a copy of the array of character class interval start points.
+ * Because a clone is returned, changes made by the caller to the returned
+ * array do not affect this automaton.
+ *
+ * @return a fresh copy of the interval start points
+ */
+ public char[] getCharIntervals() {
+ return points.clone();
+ }
+
+ /**
+ * Gets character class of given char by searching the interval start points
+ * with {@link SpecialOperations#findIndex(char, char[])}.
+ *
+ * @param c character to classify
+ * @return index into the points array for the given char
+ */
+ int getCharClass(char c) {
+ return SpecialOperations.findIndex(c, points);
+ }
+
+ // Private no-argument constructor that initializes nothing; the "unused"
+ // warning is suppressed because nothing in the class calls it directly.
+ @SuppressWarnings("unused")
+ private RunAutomaton() {}
+
+ /**
+ * Constructs a new
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class SpecialOperations {
+
+ // Private no-argument constructor: this class cannot be instantiated from outside.
+ private SpecialOperations() {}
+
+ /**
+ * Finds the largest entry whose value is less than or equal to c, or 0 if
+ * there is no such entry.
+ */
+ static int findIndex(char c, char[] points) {
+ int a = 0;
+ int b = points.length;
+ while (b - a > 1) {
+ int d = (a + b) >>> 1;
+ if (points[d] > c) b = d;
+ else if (points[d] < c) a = d;
+ else return d;
+ }
+ return a;
+ }
+
+ /**
+ * Returns true if the language of this automaton is finite.
+ */
+ public static boolean isFinite(Automaton a) {
+ if (a.isSingleton()) return true;
+ return isFinite(a.initial, new HashSet
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class State implements Serializable, Comparable
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class StatePair {
+  // Extra state carried with the pair; it takes no part in equals/hashCode.
+  State s;
+  State s1;
+  State s2;
+
+  /**
+   * Package-private constructor that also sets the extra state {@code s}.
+   */
+  StatePair(State s, State s1, State s2) {
+    this.s = s;
+    this.s1 = s1;
+    this.s2 = s2;
+  }
+
+  /**
+   * Creates a pair from two states; the extra state is left {@code null}.
+   *
+   * @param s1 first state
+   * @param s2 second state
+   */
+  public StatePair(State s1, State s2) {
+    this(null, s1, s2);
+  }
+
+  /**
+   * Gives the first component of the pair.
+   *
+   * @return first state
+   */
+  public State getFirstState() {
+    return s1;
+  }
+
+  /**
+   * Gives the second component of the pair.
+   *
+   * @return second state
+   */
+  public State getSecondState() {
+    return s2;
+  }
+
+  /**
+   * Two pairs are equal when both components are the very same state objects
+   * (reference comparison).
+   *
+   * @param obj object to compare with
+   * @return true if <code>obj</code> is a pair holding the same two states
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (!(obj instanceof StatePair)) {
+      return false;
+    }
+    StatePair other = (StatePair) obj;
+    return s1 == other.s1 && s2 == other.s2;
+  }
+
+  /**
+   * Hash code: the sum of the hash codes of the two component states.
+   *
+   * @return hash code
+   */
+  @Override
+  public int hashCode() {
+    return s1.hashCode() + s2.hashCode();
+  }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\StatePair.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/Transition.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/Transition.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/Transition.java (revision 0)
@@ -0,0 +1,179 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+
+/**
+ * Automaton transition.
+ *
+ * A transition, which belongs to a source state, consists of a Unicode
+ * character interval and a destination state.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class Transition implements Serializable, Cloneable {
+
+  static final long serialVersionUID = 40001;
+
+  /*
+   * CLASS INVARIANT: min<=max
+   */
+
+  /** Lower end point of the character interval (inclusive). */
+  char min;
+  /** Upper end point of the character interval (inclusive). */
+  char max;
+
+  /** Destination state. */
+  State to;
+
+  /**
+   * Creates a transition whose interval contains exactly one character.
+   *
+   * @param c transition character
+   * @param to destination state
+   */
+  public Transition(char c, State to) {
+    this.min = c;
+    this.max = c;
+    this.to = to;
+  }
+
+  /**
+   * Creates a transition over a character interval; both end points belong to
+   * the interval. If the end points are given in the wrong order they are
+   * swapped.
+   *
+   * @param min transition interval minimum
+   * @param max transition interval maximum
+   * @param to destination state
+   */
+  public Transition(char min, char max, State to) {
+    // store the end points in ascending order so that the invariant holds
+    if (max < min) {
+      this.min = max;
+      this.max = min;
+    } else {
+      this.min = min;
+      this.max = max;
+    }
+    this.to = to;
+  }
+
+  /** Gives the lower end point of the interval. */
+  public char getMin() {
+    return min;
+  }
+
+  /** Gives the upper end point of the interval. */
+  public char getMax() {
+    return max;
+  }
+
+  /** Gives the destination state. */
+  public State getDest() {
+    return to;
+  }
+
+  /**
+   * Two transitions are equal when they have the same interval and lead to
+   * the very same state object.
+   *
+   * @param obj object to compare with
+   * @return true if <code>obj</code> is a transition with same character
+   *         interval and destination state as this transition.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (!(obj instanceof Transition)) {
+      return false;
+    }
+    Transition other = (Transition) obj;
+    return min == other.min && max == other.max && to == other.to;
+  }
+
+  /**
+   * Hash code computed from the interval end points only; the destination
+   * state does not contribute.
+   *
+   * @return hash code
+   */
+  @Override
+  public int hashCode() {
+    return 2 * min + 3 * max;
+  }
+
+  /**
+   * Makes a shallow copy of this transition.
+   *
+   * @return clone with same character interval and destination state
+   */
+  @Override
+  public Transition clone() {
+    try {
+      return (Transition) super.clone();
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Appends a character to the builder: as-is when it is in the range
+   * 0x21-0x7e and is neither a backslash nor a double quote, otherwise as a
+   * backslash-u escape with four hex digits.
+   */
+  static void appendCharString(char c, StringBuilder b) {
+    final boolean plain = c >= 0x21 && c <= 0x7e && c != '\\' && c != '"';
+    if (plain) {
+      b.append(c);
+      return;
+    }
+    b.append("\\u");
+    final String hex = Integer.toHexString(c);
+    // left-pad with zeros up to four digits
+    for (int width = hex.length(); width < 4; width++) {
+      b.append('0');
+    }
+    b.append(hex);
+  }
+
+  /** Appends "min" or "min-max" for this transition's interval. */
+  private void appendInterval(StringBuilder b) {
+    appendCharString(min, b);
+    if (min != max) {
+      b.append("-");
+      appendCharString(max, b);
+    }
+  }
+
+  /**
+   * Returns a string describing this transition: the interval followed by the
+   * number of the destination state. Normally invoked via
+   * {@link Automaton#toString()}.
+   */
+  @Override
+  public String toString() {
+    StringBuilder out = new StringBuilder();
+    appendInterval(out);
+    out.append(" -> ").append(to.number);
+    return out.toString();
+  }
+
+  /** Appends this transition as an edge with the interval as its label. */
+  void appendDot(StringBuilder b) {
+    b.append(" -> ").append(to.number).append(" [label=\"");
+    appendInterval(b);
+    b.append("\"]\n");
+  }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\Transition.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/TransitionComparator.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0)
@@ -0,0 +1,80 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+/**
+ * Comparator for state {@link Transition}s that orders unicode char range
+ * transitions in lexicographic order.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+class TransitionComparator implements Comparator
+ *
+ * .*
+ *
+ * @see RegExp
+ */
+public class RegexpQuery extends AutomatonQuery {
+  /**
+   * A provider that provides no named automata: every lookup returns null.
+   * Declared final because it is shared by all instances and must never be
+   * reassigned.
+   */
+  private static final AutomatonProvider defaultProvider = new AutomatonProvider() {
+    public Automaton getAutomaton(String name) throws IOException {
+      return null;
+    }
+  };
+
+  /**
+   * Constructs a query for terms matching <code>term</code>, using a provider
+   * that supplies no named automata.
+   *
+   * @param term regular expression.
+   * @param flags optional RegExp features from {@link RegExp}
+   */
+  public RegexpQuery(Term term, int flags) {
+    this(term, flags, defaultProvider);
+  }
+
+  /**
+   * Constructs a query for terms matching <code>term</code>. The text of the
+   * term is parsed as a regular expression and converted to an automaton.
+   *
+   * @param term regular expression.
+   * @param flags optional RegExp features from {@link RegExp}
+   * @param provider custom AutomatonProvider for named automata
+   */
+  public RegexpQuery(Term term, int flags, AutomatonProvider provider) {
+    super(term, new RegExp(term.text(), flags).toAutomaton(provider));
+  }
+}
Property changes on: src\java\org\apache\lucene\search\RegexpQuery.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/search/WildcardQuery.java
===================================================================
--- src/java/org/apache/lucene/search/WildcardQuery.java (revision 882888)
+++ src/java/org/apache/lucene/search/WildcardQuery.java (working copy)
@@ -17,65 +17,58 @@
* limitations under the License.
*/
-import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
+
import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
-import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
/** Implements the wildcard search query. Supported wildcards are *, which
* matches any character sequence (including the empty one), and ?,
* which matches any single character. Note this query can be slow, as it
* needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
- * a Wildcard term should not start with one of the wildcards * or
- * ?.
+ * a Wildcard term should not start with the wildcard *
*
* term.
+ */
public WildcardQuery(Term term) {
- this.term = term;
- String text = term.text();
- this.termContainsWildcard = (text.indexOf('*') != -1)
- || (text.indexOf('?') != -1);
- this.termIsPrefix = termContainsWildcard
- && (text.indexOf('?') == -1)
- && (text.indexOf('*') == text.length() - 1);
+ super(term, toAutomaton(term));
}
-
- @Override
- protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
- if (termContainsWildcard)
- return new WildcardTermEnum(reader, getTerm());
- else
- return new SingleTermEnum(reader, getTerm());
- }
/**
- * Returns the pattern term.
+ * Convert Lucene wildcard syntax into an automaton.
*/
- public Term getTerm() {
- return term;
- }
-
- @Override
- public Query rewrite(IndexReader reader) throws IOException {
- if (termIsPrefix) {
- MultiTermQuery rewritten = new PrefixQuery(term.createTerm(term.text()
- .substring(0, term.text().indexOf('*'))));
- rewritten.setBoost(getBoost());
- rewritten.setRewriteMethod(getRewriteMethod());
- return rewritten;
- } else {
- return super.rewrite(reader);
+ static Automaton toAutomaton(Term wildcardquery) {
+ ListWildcardTermEnum.
@@ -44,49 +39,9 @@
* valid term if such a term exists.
*/
public WildcardTermEnum(IndexReader reader, Term term) throws IOException {
- super();
- searchTerm = term;
- field = searchTerm.field();
- final String searchTermText = searchTerm.text();
-
- final int sidx = searchTermText.indexOf(WILDCARD_STRING);
- final int cidx = searchTermText.indexOf(WILDCARD_CHAR);
- int idx = sidx;
- if (idx == -1) {
- idx = cidx;
- }
- else if (cidx >= 0) {
- idx = Math.min(idx, cidx);
- }
- pre = idx != -1?searchTerm.text().substring(0,idx): "";
-
- preLen = pre.length();
- text = searchTermText.substring(preLen);
- setEnum(reader.terms(new Term(searchTerm.field(), pre)));
+ super(WildcardQuery.toAutomaton(term), term, reader);
}
- @Override
- protected final boolean termCompare(Term term) {
- if (field == term.field()) {
- String searchText = term.text();
- if (searchText.startsWith(pre)) {
- return wildcardEquals(text, 0, searchText, preLen);
- }
- }
- endEnum = true;
- return false;
- }
-
- @Override
- public float difference() {
- return 1.0f;
- }
-
- @Override
- public final boolean endEnum() {
- return endEnum;
- }
-
/********************************************
* String equality with support for wildcards
********************************************/
@@ -98,6 +53,8 @@
* Determines if a word matches a wildcard pattern.
* Work released by Granta Design Ltd after originally being done on
* company time.
+ *
+ *
+ /** Selects minimization algorithm (default: MINIMIZE_HOPCROFT). */
+ static int minimization = MINIMIZE_HOPCROFT;
+
+ /** Initial state of this automaton. */
+ State initial;
+
+ /**
+ * If true, then this automaton is definitely deterministic (i.e., there are
+ * no choices for any run, but a run may crash).
+ */
+ boolean deterministic;
+
+ /** Extra data associated with this automaton. */
+ transient Object info;
+
+ /**
+ * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)}
+ */
+ int hash_code;
+
+ /** Singleton string. Null if not applicable. */
+ String singleton;
+
+ /** Minimize always flag. */
+ static boolean minimize_always = false;
+
+ /**
+ * Selects whether operations may modify the input automata (default:
+ * false).
+ */
+ static boolean allow_mutation = false;
+
+ /**
+ * Constructs a new automaton that accepts the empty language. Using this
+ * constructor, automata can be constructed manually from {@link State} and
+ * {@link Transition} objects.
+ *
+ * @see #setInitialState(State)
+ * @see State
+ * @see Transition
+ */
+ public Automaton() {
+ initial = new State();
+ deterministic = true;
+ singleton = null;
+ }
+
+ // Returns true when the system property "dk.brics.automaton.debug" is set,
+ // whatever its value. The property is read on every call.
+ boolean isDebug() {
+ return System.getProperty("dk.brics.automaton.debug") != null;
+ }
+
+ /**
+ * Selects minimization algorithm (default: MINIMIZE_HOPCROFT).
+ *
+ * @param algorithm minimization algorithm
+ */
+ static public void setMinimization(int algorithm) {
+ minimization = algorithm;
+ }
+
+ /**
+ * Sets or resets minimize always flag. If this flag is set, then
+ * {@link MinimizationOperations#minimize(Automaton)} will automatically be
+ * invoked after all operations that otherwise may produce non-minimal
+ * automata. By default, the flag is not set.
+ *
+ * @param flag if true, the flag is set
+ */
+ static public void setMinimizeAlways(boolean flag) {
+ minimize_always = flag;
+ }
+
+  /**
+   * Switches the allow mutate flag on or off. While the flag is set, automata
+   * operations may modify the automata they receive as input; while it is
+   * clear, operations leave the languages of their inputs unmodified. The
+   * flag is static, so the setting applies to all automata. It is not set by
+   * default.
+   *
+   * @param flag if true, the flag is set
+   * @return previous value of the flag
+   */
+  static public boolean setAllowMutate(boolean flag) {
+    final boolean previous = allow_mutation;
+    allow_mutation = flag;
+    return previous;
+  }
+
+ /**
+ * Returns the state of the allow mutate flag. If this flag is set, then all
+ * automata operations may modify automata given as input; otherwise,
+ * operations will always leave input automata languages unmodified. By
+ * default, the flag is not set.
+ *
+ * @return current value of the flag
+ */
+ static boolean getAllowMutate() {
+ return allow_mutation;
+ }
+
+ // Minimizes this automaton, but only when the static minimize_always flag is set.
+ void checkMinimizeAlways() {
+ if (minimize_always) MinimizationOperations.minimize(this);
+ }
+
+ // Returns true when this automaton is in singleton mode, i.e. the singleton
+ // string field is non-null.
+ boolean isSingleton() {
+ return singleton != null;
+ }
+
+ /**
+ * Returns the singleton string for this automaton. An automaton that accepts
+ * exactly one string may be represented in singleton mode. In that
+ * case, this method may be used to obtain the string.
+ *
+ * @return string, null if this automaton is not in singleton mode.
+ */
+ public String getSingleton() {
+ return singleton;
+ }
+
+ /**
+ * Sets initial state. This also clears the singleton string, so the
+ * automaton is no longer in singleton mode afterwards.
+ *
+ * @param s state
+ */
+ public void setInitialState(State s) {
+ initial = s;
+ singleton = null;
+ }
+
+ /**
+ * Gets initial state. The singleton representation is expanded first, so
+ * calling this method may change the internal representation of this
+ * automaton.
+ *
+ * @return state
+ */
+ public State getInitialState() {
+ expandSingleton();
+ return initial;
+ }
+
+ /**
+ * Returns deterministic flag for this automaton.
+ *
+ * @return true if the automaton is definitely deterministic, false if the
+ * automaton may be nondeterministic
+ */
+ public boolean isDeterministic() {
+ return deterministic;
+ }
+
+ /**
+ * Sets deterministic flag for this automaton. This method should (only) be
+ * used if automata are constructed manually.
+ *
+ * @param deterministic true if the automaton is definitely deterministic,
+ * false if the automaton may be nondeterministic
+ */
+ public void setDeterministic(boolean deterministic) {
+ this.deterministic = deterministic;
+ }
+
+ /**
+ * Associates extra information with this automaton. The field holding it is
+ * declared transient.
+ *
+ * @param info extra information
+ */
+ public void setInfo(Object info) {
+ this.info = info;
+ }
+
+ /**
+ * Returns extra information associated with this automaton.
+ *
+ * @return extra information
+ * @see #setInfo(Object)
+ */
+ public Object getInfo() {
+ return info;
+ }
+
+ /**
+ * Returns the set of states that are reachable from the initial state.
+ *
+ * @return set of {@link State} objects
+ */
+ public SethashCode and
+ * subsetOf.
+ */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof Automaton)) {
+      return false;
+    }
+    final Automaton other = (Automaton) obj;
+    // two singletons can be compared by their strings alone
+    if (isSingleton() && other.isSingleton()) {
+      return singleton.equals(other.singleton);
+    }
+    // cheap rejection first: different hash codes mean different automata
+    if (hashCode() != other.hashCode()) {
+      return false;
+    }
+    // same language iff each is a subset of the other
+    return BasicOperations.subsetOf(this, other) && BasicOperations.subsetOf(other, this);
+  }
+
+ /**
+ * Returns hash code for this automaton. The hash code is based on the number
+ * of states and transitions in the minimized automaton. Invoking this method
+ * may involve minimizing the automaton.
+ *
+ * @return the cached hash code, computed on demand
+ */
+ @Override
+ public int hashCode() {
+ // 0 means "not computed"; minimize() recomputes and stores the hash code
+ if (hash_code == 0) MinimizationOperations.minimize(this);
+ return hash_code;
+ }
+
+ /**
+ * Must be invoked when the stored hash code may no longer be valid.
+ */
+ void clearHashCode() {
+ hash_code = 0;
+ }
+
+ /**
+ * Returns a string representation of this automaton.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ if (isSingleton()) {
+ b.append("singleton: ");
+ for (char c : singleton.toCharArray())
+ Transition.appendCharString(c, b);
+ b.append("\n");
+ } else {
+ Setallow_mutation is
+ * set, expands if singleton.
+ */
+  Automaton cloneExpandedIfRequired() {
+    if (!allow_mutation) {
+      // inputs must stay untouched: work on an expanded copy
+      return cloneExpanded();
+    }
+    // mutation is allowed: expand this instance in place and reuse it
+    expandSingleton();
+    return this;
+  }
+
+ /**
+ * Returns a clone of this automaton.
+ */
+ @Override
+ public Automaton clone() {
+ try {
+ Automaton a = (Automaton) super.clone();
+ if (!isSingleton()) {
+ HashMapallow_mutation flag is set.
+ */
+  Automaton cloneIfRequired() {
+    // reuse this instance when mutation is allowed, otherwise hand out a copy
+    return allow_mutation ? this : clone();
+  }
+
+  /**
+   * Retrieves a serialized <code>Automaton</code> located by a URL. The stream
+   * opened on the URL is always closed before this method returns, whether or
+   * not reading succeeds.
+   *
+   * @param url URL of serialized automaton
+   * @return the deserialized automaton
+   * @exception IOException if input/output related exception occurs
+   * @exception OptionalDataException if the data is not a serialized object
+   * @exception InvalidClassException if the class serial number does not match
+   * @exception ClassCastException if the data is not a serialized
+   *              <code>Automaton</code>
+   * @exception ClassNotFoundException if the class of the serialized object
+   *              cannot be found
+   */
+  public static Automaton load(URL url) throws IOException,
+      OptionalDataException, ClassCastException, ClassNotFoundException,
+      InvalidClassException {
+    // This method opens the stream, so it is responsible for closing it.
+    final InputStream in = url.openStream();
+    try {
+      return load(in);
+    } finally {
+      in.close();
+    }
+  }
+
+ /**
+ * Retrieves a serialized Automaton from a stream. The given stream is not
+ * closed by this method.
+ *
+ * @param stream input stream with serialized automaton
+ * @return the deserialized automaton
+ * @exception IOException if input/output related exception occurs
+ * @exception OptionalDataException if the data is not a serialized object
+ * @exception InvalidClassException if the class serial number does not match
+ * @exception ClassCastException if the data is not a serialized
+ * Automaton
+ * @exception ClassNotFoundException if the class of the serialized object
+ * cannot be found
+ */
+ public static Automaton load(InputStream stream) throws IOException,
+ OptionalDataException, ClassCastException, ClassNotFoundException,
+ InvalidClassException {
+ // SECURITY NOTE(review): this uses Java native deserialization, which is
+ // unsafe on untrusted data; only pass streams from trusted sources.
+ ObjectInputStream s = new ObjectInputStream(stream);
+ return (Automaton) s.readObject();
+ }
+
+ /**
+ * Writes this Automaton to the given stream using Java serialization.
+ * The stream is flushed but not closed.
+ *
+ * @param stream output stream for serialized automaton
+ * @exception IOException if input/output related exception occurs
+ */
+ public void store(OutputStream stream) throws IOException {
+ ObjectOutputStream s = new ObjectOutputStream(stream);
+ s.writeObject(this);
+ s.flush();
+ }
+
+ /**
+ * See {@link BasicOperations#concatenate(Automaton, Automaton)}.
+ */
+ public Automaton concatenate(Automaton a) {
+ return BasicOperations.concatenate(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#concatenate(List)}.
+ */
+ static public Automaton concatenate(List
+ * This also updates the values for the {@code start}, {@code end}, and
+ * {@code group} methods.
+ *
+ * @return {@code true} if there is a matching subsequence.
+ */
+ public boolean find() {
+ int begin;
+ if (getMatchStart() == -2) {
+ return false;
+ } else if (getMatchStart() == -1) {
+ begin = 0;
+ } else {
+ begin = getMatchEnd();
+ }
+
+ int match_start;
+ int match_end;
+ if (automaton.isAccept(automaton.getInitialState())) {
+ match_start = begin;
+ match_end = begin;
+ } else {
+ match_start = -1;
+ match_end = -1;
+ }
+ int l = getChars().length();
+ while (begin < l) {
+ int p = automaton.getInitialState();
+ for (int i = begin; i < l; i += 1) {
+ final int new_state = automaton.step(p, getChars().charAt(i));
+ if (new_state == -1) {
+ break;
+ } else if (automaton.isAccept(new_state)) {
+ if (match_start == -1) {
+ match_start = begin;
+ }
+ match_end = i;
+ }
+ p = new_state;
+ }
+ if (match_start != -1) {
+ setMatch(match_start, match_end + 1);
+ return true;
+ }
+ begin += 1;
+ }
+ if (match_start != -1) {
+ setMatch(match_start, match_end + 1);
+ return true;
+ } else {
+ setMatch(-2, -2);
+ return false;
+ }
+ }
+
+ // Records the bounds of the current match. find() also calls this with
+ // (-2, -2) to record that the search failed; that pair passes the check below.
+ // Throws IllegalArgumentException if matchStart is greater than matchEnd.
+ private void setMatch(final int matchStart, final int matchEnd)
+ throws IllegalArgumentException {
+ if (matchStart > matchEnd) {
+ throw new IllegalArgumentException(
+ "Start must be less than or equal to end: " + matchStart + ", "
+ + matchEnd);
+ }
+ this.matchStart = matchStart;
+ this.matchEnd = matchEnd;
+ }
+
+ private int getMatchStart() {
+ return matchStart;
+ }
+
+ private int getMatchEnd() {
+ return matchEnd;
+ }
+
+ private CharSequence getChars() {
+ return chars;
+ }
+
+ /**
+ * Returns the offset after the last character matched.
+ *
+ * @return The offset after the last character matched.
+ * @throws IllegalStateException if there has not been a match attempt or if
+ * the last attempt yielded no results.
+ */
+ public int end() throws IllegalStateException {
+ matchGood();
+ return matchEnd;
+ }
+
+ /**
+ * Returns the offset after the last character matched of the specified
+ * capturing group.
+ * Note that because the automaton does not support capturing groups the only
+ * valid group is 0 (the entire match).
+ *
+ * @param group the desired capturing group.
+ * @return The offset after the last character matched of the specified
+ * capturing group.
+ * @throws IllegalStateException if there has not been a match attempt or if
+ * the last attempt yielded no results.
+ * @throws IndexOutOfBoundsException if the specified capturing group does not
+ * exist in the underlying automaton.
+ */
+ public int end(final int group) throws IndexOutOfBoundsException,
+ IllegalStateException {
+ onlyZero(group);
+ return end();
+ }
+
+ /**
+ * Returns the subsequence of the input found by the previous match.
+ *
+ * @return The subsequence of the input found by the previous match.
+ * @throws IllegalStateException if there has not been a match attempt or if
+ * the last attempt yielded no results.
+ */
+ public String group() throws IllegalStateException {
+ matchGood();
+ return chars.subSequence(matchStart, matchEnd).toString();
+ }
+
+ /**
+ * Returns the subsequence of the input found by the specified capturing group
+ * during the previous match operation.
+ * Note that because the automaton does not support capturing groups the only
+ * valid group is 0 (the entire match).
+ *
+ * @param group the desired capturing group.
+ * @return The subsequence of the input found by the specified capturing group
+ * during the previous match operation; for group 0 this is the same
+ * value as {@link #group()}.
+ * @throws IllegalStateException if there has not been a match attempt or if
+ * the last attempt yielded no results.
+ * @throws IndexOutOfBoundsException if the specified capturing group does not
+ * exist in the underlying automaton.
+ */
+ public String group(final int group) throws IndexOutOfBoundsException,
+ IllegalStateException {
+ onlyZero(group);
+ return group();
+ }
+
+ /**
+ * Returns the number of capturing groups in the underlying automaton.
+ * Note that because the automaton does not support capturing groups this
+ * method will always return 0.
+ *
+ * @return The number of capturing groups in the underlying automaton.
+ */
+ public int groupCount() {
+ return 0;
+ }
+
+ /**
+ * Returns the offset of the first character matched.
+ *
+ * @return The offset of the first character matched.
+ * @throws IllegalStateException if there has not been a match attempt or if
+ * the last attempt yielded no results.
+ */
+ public int start() throws IllegalStateException {
+ matchGood();
+ return matchStart;
+ }
+
+ /**
+ * Returns the offset of the first character matched of the specified
+ * capturing group.
+ * Note that because the automaton does not support capturing groups the only
+ * valid group is 0 (the entire match).
+ *
+ * @param group the desired capturing group.
+ * @return The offset of the first character matched of the specified
+ * capturing group.
+ * @throws IllegalStateException if there has not been a match attempt or if
+ * the last attempt yielded no results.
+ * @throws IndexOutOfBoundsException if the specified capturing group does not
+ * exist in the underlying automaton.
+ */
+ public int start(int group) throws IndexOutOfBoundsException,
+ IllegalStateException {
+ onlyZero(group);
+ return start();
+ }
+
+ /** Helper method that requires the group argument to be 0. */
+ private static void onlyZero(final int group)
+ throws IndexOutOfBoundsException {
+ if (group != 0) {
+ throw new IndexOutOfBoundsException("The only group supported is 0.");
+ }
+ }
+
+ /** Helper method to check that the last match attempt was valid. */
+ private void matchGood() throws IllegalStateException {
+ if ((matchStart < 0) || (matchEnd < 0)) {
+ throw new IllegalStateException("There was no available match.");
+ }
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\AutomatonMatcher.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0)
@@ -0,0 +1,53 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+
+/**
+ * Automaton provider for RegExp.
+ * {@link RegExp#toAutomaton(AutomatonProvider)}
+ *
+ * min or more concatenated
+ * repetitions of the language of the given automaton.
+ * min.
+ */
+ static public Automaton repeat(Automaton a, int min) {
+ if (min == 0) return repeat(a);
+ Listmin and
+ * max (including both) concatenated repetitions of the language
+ * of the given automaton.
+ * min and
+ * max.
+ */
+ static public Automaton repeat(Automaton a, int min, int max) {
+ if (min > max) return BasicAutomata.makeEmpty();
+ max -= min;
+ a.expandSingleton();
+ Automaton b;
+ if (min == 0) b = BasicAutomata.makeEmptyString();
+ else if (min == 1) b = a.clone();
+ else {
+ Lista1 and the complement of the language of
+ * a2. As a side-effect, the automata may be determinized, if not
+ * already deterministic.
+ * a1 is a subset of the language
+ * of a2. As a side-effect, a2 is determinized if
+ * not already marked as deterministic.
+ * Automaton.
+ *
+ * <pre>
+ * regexp       ::= unionexp
+ *                |
+ * unionexp     ::= interexp | unionexp                         (union)
+ *                | interexp
+ * interexp     ::= concatexp &amp; interexp                        (intersection) [OPTIONAL]
+ *                | concatexp
+ * concatexp    ::= repeatexp concatexp                         (concatenation)
+ *                | repeatexp
+ * repeatexp    ::= repeatexp ?                                 (zero or one occurrence)
+ *                | repeatexp *                                 (zero or more occurrences)
+ *                | repeatexp +                                 (one or more occurrences)
+ *                | repeatexp {n}                               (n occurrences)
+ *                | repeatexp {n,}                              (n or more occurrences)
+ *                | repeatexp {n,m}                             (n to m occurrences, including both)
+ *                | complexp
+ * complexp     ::= ~ complexp                                  (complement) [OPTIONAL]
+ *                | charclassexp
+ * charclassexp ::= [ charclasses ]                             (character class)
+ *                | [^ charclasses ]                            (negated character class)
+ *                | simpleexp
+ * charclasses  ::= charclass charclasses
+ *                | charclass
+ * charclass    ::= charexp - charexp                           (character range, including end-points)
+ *                | charexp
+ * simpleexp    ::= charexp
+ *                | .                                           (any single character)
+ *                | #                                           (the empty language) [OPTIONAL]
+ *                | @                                           (any string) [OPTIONAL]
+ *                | " &lt;Unicode string without double-quotes&gt; "  (a string)
+ *                | ( )                                         (the empty string)
+ *                | ( unionexp )                                (precedence override)
+ *                | &lt; &lt;identifier&gt; &gt;                            (named automaton) [OPTIONAL]
+ *                | &lt;n-m&gt;                                       (numerical interval) [OPTIONAL]
+ * charexp      ::= &lt;Unicode character&gt;                         (a single non-reserved character)
+ *                | \ &lt;Unicode character&gt;                       (a single character)
+ * </pre>
+ *
+ * RegExp constructor.
+ * The reserved characters used in the (enabled) syntax must be escaped with
+ * backslash (\) or double-quotes ("..."). (In
+ * contrast to other regexp syntaxes, this is required also in character
+ * classes.) Be aware that dash (-) has a special meaning in
+ * charclass expressions. An identifier is a string not containing right
+ * angle bracket (>) or dash (-). Numerical
+ * intervals are specified by non-negative decimal integers and include both end
+ * points, and if n and m have the same number
+ * of digits, then the conforming strings must have that length (i.e. prefixed
+ * by 0's).
+ *
+ * RegExp from a string. Same as
+ * RegExp(s, ALL).
+ *
+ * @param s regexp string
+ * @exception IllegalArgumentException if an error occurred while parsing the
+ * regular expression
+ */
+ public RegExp(String s) throws IllegalArgumentException {
+ // Delegate to the (String, int) constructor, passing ALL as the
+ // syntax_flags argument.
+ this(s, ALL);
+ }
+
+ /**
+ * Constructs new RegExp from a string.
+ *
+ * @param s regexp string
+ * @param syntax_flags boolean 'or' of optional syntax constructs to be
+ * enabled
+ * @exception IllegalArgumentException if an error occured while parsing the
+ * regular expression
+ */
+ public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
+ b = s;
+ flags = syntax_flags;
+ RegExp e;
+ if (s.length() == 0) e = makeString("");
+ else {
+ e = parseUnionExp();
+ if (pos < b.length()) throw new IllegalArgumentException(
+ "end-of-string expected at position " + pos);
+ }
+ kind = e.kind;
+ exp1 = e.exp1;
+ exp2 = e.exp2;
+ this.s = e.s;
+ c = e.c;
+ min = e.min;
+ max = e.max;
+ digits = e.digits;
+ from = e.from;
+ to = e.to;
+ b = null;
+ }
+
+ /**
+  * Builds a new <code>Automaton</code> from this <code>RegExp</code> without
+  * supplying any source of named automata: both arguments handed to
+  * <code>toAutomatonAllowMutate</code> are <code>null</code>.
+  */
+ public Automaton toAutomaton() {
+   final Automaton result = toAutomatonAllowMutate(null, null);
+   return result;
+ }
+
+ /**
+  * Builds a new <code>Automaton</code> from this <code>RegExp</code>,
+  * resolving named identifiers through the given provider. The constructed
+  * automaton is minimal and deterministic and has no transitions to dead
+  * states.
+  *
+  * @param automaton_provider provider of automata for named identifiers
+  * @exception IllegalArgumentException if this regular expression uses a named
+  *            identifier that is not available from the automaton provider
+  */
+ public Automaton toAutomaton(AutomatonProvider automaton_provider)
+     throws IllegalArgumentException {
+   // Only the provider is supplied; the first argument stays null.
+   final Automaton result = toAutomatonAllowMutate(null, automaton_provider);
+   return result;
+ }
+
+ /**
+ * Constructs new Automaton from this RegExp. The
+ * constructed automaton is minimal and deterministic and has no transitions
+ * to dead states.
+ *
+ * @param automata a map from automaton identifiers to automata (of type
+ * Automaton).
+ * @exception IllegalArgumentException if this regular expression uses a named
+ * identifier that does not occur in the automaton map
+ */
+ public Automaton toAutomaton(MapRunAutomaton from a deterministic
+ * Automaton. Same as RunAutomaton(a, true).
+ *
+ * @param a an automaton
+ */
+ public RunAutomaton(Automaton a) {
+ // Delegate to the (Automaton, boolean) constructor with tableize = true.
+ this(a, true);
+ }
+
+ /**
+  * Retrieves a serialized <code>RunAutomaton</code> located by a URL.
+  * The stream opened on the URL is closed before this method returns,
+  * whether or not deserialization succeeds.
+  * <p>
+  * NOTE: the data is read with Java object serialization, so this method
+  * should only be pointed at trusted locations.
+  *
+  * @param url URL of serialized automaton
+  * @exception IOException if input/output related exception occurs
+  * @exception OptionalDataException if the data is not a serialized object
+  * @exception InvalidClassException if the class serial number does not match
+  * @exception ClassCastException if the data is not a serialized
+  *            <code>RunAutomaton</code>
+  * @exception ClassNotFoundException if the class of the serialized object
+  *            cannot be found
+  */
+ public static RunAutomaton load(URL url) throws IOException,
+     OptionalDataException, ClassCastException, ClassNotFoundException,
+     InvalidClassException {
+   // This method opens the stream, so it is responsible for closing it;
+   // load(InputStream) leaves the stream it is given open.
+   final InputStream in = url.openStream();
+   try {
+     return load(in);
+   } finally {
+     in.close();
+   }
+ }
+
+ /**
+  * Reads a serialized <code>RunAutomaton</code> from the given stream.
+  * The stream is wrapped in an {@link ObjectInputStream} and a single object
+  * is read from it; the stream is not closed by this method.
+  * <p>
+  * NOTE: this relies on Java object serialization, so the stream should only
+  * carry data from a trusted source.
+  *
+  * @param stream input stream with serialized automaton
+  * @exception IOException if input/output related exception occurs
+  * @exception OptionalDataException if the data is not a serialized object
+  * @exception InvalidClassException if the class serial number does not match
+  * @exception ClassCastException if the data is not a serialized
+  *            <code>RunAutomaton</code>
+  * @exception ClassNotFoundException if the class of the serialized object
+  *            cannot be found
+  */
+ public static RunAutomaton load(InputStream stream) throws IOException,
+     OptionalDataException, ClassCastException, ClassNotFoundException,
+     InvalidClassException {
+   final Object deserialized = new ObjectInputStream(stream).readObject();
+   return (RunAutomaton) deserialized;
+ }
+
+ /**
+  * Serializes this <code>RunAutomaton</code> to the given stream using Java
+  * object serialization. The stream is flushed afterwards but is not closed.
+  *
+  * @param stream output stream for serialized automaton
+  * @exception IOException if input/output related exception occurs
+  */
+ public void store(OutputStream stream) throws IOException {
+   final ObjectOutputStream out = new ObjectOutputStream(stream);
+   out.writeObject(this);
+   out.flush();
+ }
+
+ /**
+ * Constructs a new RunAutomaton from a deterministic
+ * Automaton. If the given automaton is not deterministic, it is
+ * determinized first.
+ *
+ * @param a an automaton
+ * @param tableize if true, a transition table is created which makes the
+ * run method faster in return of a higher memory usage
+ */
+ public RunAutomaton(Automaton a, boolean tableize) {
+ a.determinize();
+ points = a.getStartPoints();
+ SetAutomaton had no dead states, -1 is returned here if and only
+ * if a dead state is entered in an equivalent automaton with a total
+ * transition function.)
+ */
+ public int step(int state, char c) {
+ if (classmap == null) return transitions[state * points.length
+ + getCharClass(c)];
+ else return transitions[state * points.length
+ + classmap[c - Character.MIN_VALUE]];
+ }
+
+ /**
+  * Runs this automaton over the whole string and reports whether the string
+  * is accepted.
+  *
+  * @param s the string to test
+  * @return true if the state reached after the last character is an accept
+  *         state; false if {@link #step(int, char)} returns -1 for some
+  *         character or if the final state is not an accept state
+  */
+ public boolean run(String s) {
+   int state = initial;
+   final int length = s.length();
+   int i = 0;
+   while (i < length) {
+     state = step(state, s.charAt(i));
+     if (state == -1) {
+       // No transition for this character: the string cannot be accepted.
+       return false;
+     }
+     i++;
+   }
+   return accept[state];
+ }
+
+ /**
+  * Finds the longest prefix of <code>s</code>, starting at
+  * <code>offset</code>, that is accepted by this automaton.
+  *
+  * @param s the string
+  * @param offset offset into <code>s</code> where the run starts
+  * @return length of the longest accepted run, -1 if no run is accepted
+  */
+ public int run(String s, int offset) {
+   int state = initial;
+   final int end = s.length();
+   int longest = -1;
+   for (int pos = offset; pos <= end; pos++) {
+     // The state reached after consuming (pos - offset) characters.
+     if (accept[state]) {
+       longest = pos - offset;
+     }
+     if (pos == end) {
+       break;
+     }
+     state = step(state, s.charAt(pos));
+     if (state == -1) {
+       // No transition: nothing longer can be accepted.
+       break;
+     }
+   }
+   return longest;
+ }
+
+ /**
+ * Creates a new automaton matcher for the given input.
+ *
+ * @param s the CharSequence to search
+ * @return A new automaton matcher for the given input
+ */
+ public AutomatonMatcher newMatcher(CharSequence s) {
+   // The matcher is bound to this automaton and to the complete sequence.
+   final AutomatonMatcher matcher = new AutomatonMatcher(s, this);
+   return matcher;
+ }
+
+ /**
+ * Creates a new automaton matcher for the given input.
+ *
+ * @param s the CharSequence to search
+ * @param startOffset the starting offset of the given character sequence
+ * @param endOffset the ending offset of the given character sequence
+ * @return A new automaton matcher for the given input
+ */
+ public AutomatonMatcher newMatcher(CharSequence s, int startOffset,
+     int endOffset) {
+   // Restrict the input to the requested window; the matcher only ever
+   // sees this sub-sequence.
+   final CharSequence window = s.subSequence(startOffset, endOffset);
+   return new AutomatonMatcher(window, this);
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\RunAutomaton.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/SpecialOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0)
@@ -0,0 +1,118 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.HashSet;
+
+/**
+ * Special automata operations.
+ *
+ *