Index: LICENSE.txt
===================================================================
--- LICENSE.txt (revision 888316)
+++ LICENSE.txt (working copy)
@@ -237,4 +237,34 @@
http://www.python.org/download/releases/2.4.2/license/
+Some code in src/java/org/apache/lucene/util/automaton was
+derived from Brics automaton sources available at
+www.brics.dk/automaton/. Here is the copyright from those sources:
+/*
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
Index: NOTICE.txt
===================================================================
--- NOTICE.txt (revision 888316)
+++ NOTICE.txt (working copy)
@@ -33,3 +33,6 @@
ICU4J, (under contrib/collation) is licensed under an MIT styles license
(contrib/collation/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008
International Business Machines Corporation and others
+
+Brics Automaton (under src/java/org/apache/lucene/util/automaton) is
+BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
Index: src/java/org/apache/lucene/search/AutomatonQuery.java
===================================================================
--- src/java/org/apache/lucene/search/AutomatonQuery.java (revision 0)
+++ src/java/org/apache/lucene/search/AutomatonQuery.java (revision 0)
@@ -0,0 +1,151 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.SpecialOperations;
+
+/**
+ * A {@link Query} that will match terms against a finite-state machine.
+ *
+ * This query will match documents that contain terms accepted by a given
+ * finite-state machine. The automaton can be constructed with the
+ * {@link org.apache.lucene.util.automaton} API. Alternatively, it can be
+ * created from a regular expression with {@link RegexpQuery} or from
+ * the standard Lucene wildcard syntax with {@link WildcardQuery}.
+ *
+ *
+ * When the query is executed, it will create an equivalent minimal DFA of the
+ * finite-state machine, and will enumerate the term dictionary in an
+ * intelligent way to reduce the number of comparisons. For example: the regular
+ * expression of [dl]og? will make approximately four comparisons:
+ * do, dog, lo, and log.
+ *
+ */
+public class AutomatonQuery extends MultiTermQuery {
+ /** the automaton to match index terms against */
+ protected Automaton automaton;
+ /** term containing the field, and possibly some pattern structure */
+ protected Term term;
+
+ /**
+ * Create a new AutomatonQuery from an {@link Automaton}.
+ *
+ * @param term Term containing field and possibly some pattern structure. The
+ * term text is ignored.
+ * @param automaton Automaton to run, terms that are accepted are considered a
+ * match.
+ */
+ public AutomatonQuery(Term term, Automaton automaton) {
+ super(term.field());
+ this.term = term;
+ this.automaton = automaton;
+ MinimizationOperations.minimize(automaton);
+ }
+
+ @Override
+ protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
+ // matches nothing
+ if (BasicOperations.isEmpty(automaton)) {
+ return new EmptyTermsEnum();
+ }
+
+ // matches all possible strings
+ if (BasicOperations.isTotal(automaton)) {
+ final Terms terms = reader.fields().terms(getField());
+ return (terms != null) ? terms.iterator() : new EmptyTermsEnum();
+ }
+
+ // matches a fixed string in singleton representation
+ String singleton = automaton.getSingleton();
+ if (singleton != null)
+ return new SingleTermsEnum(reader, term.createTerm(singleton));
+
+ // matches a fixed string in expanded representation
+ String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
+ if (automaton.equals(BasicAutomata.makeString(commonPrefix))) {
+ return new SingleTermsEnum(reader, term.createTerm(commonPrefix));
+ }
+
+ // matches a constant prefix
+ Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata
+ .makeString(commonPrefix), BasicAutomata.makeAnyString());
+ if (automaton.equals(prefixAutomaton)) {
+ return new PrefixTermsEnum(reader, term.createTerm(commonPrefix));
+ }
+
+ return new AutomatonTermsEnum(automaton, term, reader);
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = super.hashCode();
+ result = prime * result + ((automaton == null) ? 0 : automaton.hashCode());
+ result = prime * result + ((term == null) ? 0 : term.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (!super.equals(obj))
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ AutomatonQuery other = (AutomatonQuery) obj;
+ if (automaton == null) {
+ if (other.automaton != null)
+ return false;
+ } else if (!automaton.equals(other.automaton))
+ return false;
+ if (term == null) {
+ if (other.term != null)
+ return false;
+ } else if (!term.equals(other.term))
+ return false;
+ return true;
+ }
+
+ @Override
+ public String toString(String field) {
+ StringBuilder buffer = new StringBuilder();
+ if (!term.field().equals(field)) {
+ buffer.append(term.field());
+ buffer.append(":");
+ }
+ buffer.append(getClass().getSimpleName());
+ buffer.append(" {");
+ buffer.append('\n');
+ buffer.append(automaton.toString());
+ buffer.append("}");
+ buffer.append(ToStringUtils.boost(getBoost()));
+ return buffer.toString();
+ }
+}
Property changes on: src\java\org\apache\lucene\search\AutomatonQuery.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/search/AutomatonTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 0)
+++ src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 0)
@@ -0,0 +1,391 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.BitSet;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+/**
+ * A FilteredTermsEnum that enumerates terms based upon what is accepted by a
+ * DFA.
+ *
+ * The algorithm is such:
+ *
+ * - As long as matches are successful, keep reading sequentially.
+ *
+ * - When a match fails, skip to the next string in lexicographic order that
+ * does not enter a reject state.
+ *
+ *
+ * The algorithm does not attempt to actually skip to the next string that is
+ * completely accepted. This is not possible when the language accepted by the
+ * FSM is not finite (i.e. * operator).
+ *
+ *
+ * If the DFA has a leading kleene star, or something similar, it will
+ * need to run against the entire term dictionary. In this case it's much
+ * better to do just that than to use smart enumeration.
+ * This heuristic looks for an initial loop, with a range of at least 1/3
+ * of the unicode BMP.
+ * Use {@link #usesLinearMode} to find out if it enumerates all terms
+ * in linear mode without seeking.
+ *
+ *
+ * WARNING: The status of the Automaton feature is
+ * experimental. The APIs introduced here might change in the future and will
+ * not be supported anymore in such a case.
+ *
+ */
+public class AutomatonTermsEnum extends FilteredTermsEnum {
+ // the object-oriented form of the DFA
+ private final Automaton automaton;
+ // a tableized array-based form of the DFA
+ private final RunAutomaton runAutomaton;
+ // true if this enum will not seek around
+ private final boolean linearMode;
+ // common suffix of the automaton
+ private final TermRef commonSuffixRef;
+ // true if the automaton accepts a finite language
+ private final boolean finite;
+ // array of sorted transitions for each state, indexed by state number
+ private final Transition[][] allTransitions;
+ // for path tracking: each bit is a numbered state
+ private final BitSet visited;
+ // used for unicode conversion from TermRef byte[] to char[]
+ private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+ // used for unicode conversion from char[] to TermRef byte[]
+ private final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
+ // the reference used for seeking forwards through the term dictionary
+ private final TermRef seekTermRef = new TermRef();
+ // the field being enumerated
+ private final String field;
+
+ // the accept statuses returned by accept(); which ones are used depends on the internal mode
+ private final AcceptStatus NO_MATCH, YES_MATCH;
+
+ /**
+ * Construct an enumerator based upon an automaton, enumerating the specified
+ * field, working on a supplied reader.
+ *
+ * The parameter linearMode determines whether or not it will use smart enumeration.
+ */
+ AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader, boolean linearMode)
+ throws IOException {
+ super(reader, queryTerm.field());
+ this.automaton = automaton;
+ field = queryTerm.field();
+ this.linearMode = linearMode;
+
+ /*
+ * tableize the automaton. this also ensures it is deterministic, and has no
+ * transitions to dead states. it also invokes Automaton.setStateNumbers to
+ * number the original states (this is how they are tableized)
+ */
+ runAutomaton = new RunAutomaton(this.automaton);
+
+ if (this.linearMode) {
+ // iterate all terms in linear mode
+ this.finite = false;
+ allTransitions = null;
+ visited = null;
+ commonSuffixRef = new TermRef(getValidUTF16Suffix(SpecialOperations
+ .getCommonSuffix(automaton)));
+ NO_MATCH = AcceptStatus.NO;
+ YES_MATCH = AcceptStatus.YES;
+ } else {
+ // if the automaton is finite, we will never read sequentially, but always seek.
+ this.finite = SpecialOperations.isFinite(this.automaton);
+ // in nonlinear mode, the common suffix isn't that helpful.
+ // we will seek each time anyway (and take the unicode conversion hit).
+ // its also currently expensive to calculate, because getCommonSuffix is
+ // a bit expensive.
+ commonSuffixRef = new TermRef("");
+ // build a cache of sorted transitions for every state
+ allTransitions = new Transition[runAutomaton.getSize()][];
+ for (State state : this.automaton.getStates())
+ allTransitions[state.getNumber()] = state.getSortedTransitionArray(false);
+ // used for path tracking, where each bit is a numbered state.
+ visited = new BitSet(runAutomaton.getSize());
+ NO_MATCH = AcceptStatus.NO_AND_SEEK;
+ YES_MATCH = finite ? AcceptStatus.YES_AND_SEEK : AcceptStatus.YES;
+ }
+ }
+
+ /**
+ * Construct an enumerator based upon an automaton, enumerating the specified
+ * field, working on a supplied reader.
+ *
+ * It will automagically determine whether or not to enumerate the term dictionary
+ * in a smart way, or to just do a linear scan depending upon a heuristic.
+ */
+ public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader)
+ throws IOException {
+ this(automaton, queryTerm, reader, AutomatonTermsEnum.isSlow(automaton));
+ }
+
+ /**
+ * Heuristic to detect if an automaton will be so slow,
+ * that it is better to do a linear enumeration.
+ *
+ * A very slow automaton will simply cause a lot of wasted disk seeks.
+ * Instead in that case it is actually faster to do a linear enumeration.
+ *
+ * @param automaton automaton
+ * @return true if it will result in bad search performance
+ */
+ private static boolean isSlow(Automaton automaton) {
+ /*
+ * If the DFA has a leading kleene star, or something similar, it will
+ * need to run against the entire term dictionary. In this case its much
+ * better to do just that than to use smart enumeration.
+ *
+ * this heuristic looks for an initial loop, with a range of at least 1/3
+ * of the unicode BMP.
+ */
+ State initialState = automaton.getInitialState();
+ boolean linearMode = false;
+ for (Transition transition : initialState.getTransitions()) {
+ if (transition.getDest() == initialState &&
+ (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) {
+ linearMode = true;
+ break;
+ }
+ }
+ return linearMode;
+ }
+
+ /**
+ * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode.
+ */
+ public final boolean usesLinearMode() {
+ return linearMode;
+ }
+
+ /**
+ * Returns true if the term matches the automaton. Also stashes away the term
+ * to assist with smart enumeration.
+ *
+ * In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted.
+ * In smart mode, it will never do this.
+ */
+ @Override
+ protected AcceptStatus accept(final TermRef term) {
+ if (term.endsWith(commonSuffixRef)) {
+ UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+ return runAutomaton.run(utf16.result, 0, utf16.length) ? YES_MATCH : NO_MATCH;
+ } else {
+ return NO_MATCH;
+ }
+ }
+
+ @Override
+ protected TermRef nextSeekTerm(final TermRef term) throws IOException {
+ if (term == null) {
+ // return the first seek term
+ if (linearMode) {
+ seekTermRef.copy("");
+ } else {
+ utf16.copyText("");
+ if (!nextString())
+ return null;
+ UnicodeUtil.nextValidUTF16String(utf16);
+ UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, utf8);
+ seekTermRef.bytes = utf8.result;
+ seekTermRef.offset = 0;
+ seekTermRef.length = utf8.length;
+ }
+ return seekTermRef;
+ } else if (!linearMode) {
+ // seek to the next possible string
+ UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+ if (nextString()) {
+ // reposition
+ UnicodeUtil.nextValidUTF16String(utf16);
+ UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, utf8);
+ seekTermRef.bytes = utf8.result;
+ seekTermRef.offset = 0;
+ seekTermRef.length = utf8.length;
+ return seekTermRef;
+ }
+ }
+ // no more possible strings can match
+ return null;
+ }
+
+ /**
+ * Advances the utf16 buffer to the next String, in lexicographic order after its current contents, that will not put
+ * the machine into a reject state. If such a string does not exist, returns
+ * false.
+ *
+ * The correctness of this method depends upon the automaton being deterministic,
+ * and having no transitions to dead states.
+ *
+ * @return true if more possible solutions exist for the DFA
+ */
+ private boolean nextString() {
+ int state;
+ int pos = 0;
+
+ while (true) {
+ state = runAutomaton.getInitialState();
+ // walk the automaton until a character is rejected.
+ for (pos = 0; pos < utf16.length; pos++) {
+ int nextState = runAutomaton.step(state, utf16.result[pos]);
+ if (nextState == -1)
+ break;
+ else
+ state = nextState;
+ }
+
+ // take the useful portion, and the last non-reject state, and attempt to
+ // append characters that will match.
+ if (nextString(state, pos)) {
+ return true;
+ } else { /* no more solutions exist from this useful portion, backtrack */
+ if (!backtrack(pos)) /* no more solutions at all */
+ return false;
+ else if (runAutomaton.run(utf16.result, 0, utf16.length))
+ /* String is good to go as-is */
+ return true;
+ /* else advance further */
+ }
+ }
+ }
+
+ /**
+ * Returns the next String in lexicographic order that will not put
+ * the machine into a reject state.
+ *
+ * This method traverses the DFA from the given position in the String,
+ * starting at the given state.
+ *
+ * If this cannot satisfy the machine, returns false. This method will
+ * walk the minimal path, in lexicographic order, as long as possible.
+ *
+ * If this method returns false, then there might still be more solutions,
+ * it is necessary to backtrack to find out.
+ *
+ * @param state current non-reject state
+ * @param position useful portion of the string
+ * @return true if more possible solutions exist for the DFA from this
+ * position
+ */
+ private boolean nextString(int state, int position) {
+ /*
+ * the next lexicographic character must be greater than the existing
+ * character, if it exists.
+ */
+ char c = 0;
+ if (position < utf16.length) {
+ c = utf16.result[position];
+ // if the next character is U+FFFF and is not part of the useful portion,
+ // then by definition it puts us in a reject state, and therefore this
+ // path is dead. there cannot be any higher transitions. backtrack.
+ if (c == '\uFFFF')
+ return false;
+ else
+ c++;
+ }
+
+ utf16.setLength(position);
+ visited.clear();
+ visited.set(state);
+
+ Transition transitions[] = allTransitions[state];
+
+ // find the minimal path (lexicographic order) that is >= c
+
+ for (int i = 0; i < transitions.length; i++) {
+ Transition transition = transitions[i];
+ if (transition.getMax() >= c) {
+ char nextChar = (char) Math.max(c, transition.getMin());
+ // append either the next sequential char, or the minimum transition
+ utf16.setLength(utf16.length + 1);
+ utf16.result[utf16.length - 1] = nextChar;
+ state = transition.getDest().getNumber();
+ /*
+ * as long as is possible, continue down the minimal path in
+ * lexicographic order. if a loop or accept state is encountered, stop.
+ */
+ while (!visited.get(state) && !runAutomaton.isAccept(state)) {
+ visited.set(state);
+ /*
+ * Note: we work with a DFA with no transitions to dead states.
+ * so the below is ok, if it is not an accept state,
+ * then there MUST be at least one transition.
+ */
+ transition = allTransitions[state][0];
+ // append the minimum transition
+ utf16.setLength(utf16.length + 1);
+ utf16.result[utf16.length - 1] = transition.getMin();
+ state = transition.getDest().getNumber();
+ }
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Attempts to backtrack thru the string after encountering a dead end
+ * at some given position. Returns false if no more possible strings
+ * can match.
+ *
+ * @param position current position in the input String
+ * @return true if more possible solutions exist for the DFA
+ */
+ private boolean backtrack(int position) {
+ while (position > 0) {
+ char nextChar = utf16.result[position - 1];
+ // if a character is U+FFFF its a dead-end too,
+ // because there is no higher character in UTF-16 sort order.
+ if (nextChar != '\uFFFF') {
+ nextChar++;
+ utf16.result[position - 1] = nextChar;
+ utf16.setLength(position);
+ return true;
+ }
+ position--;
+ }
+ return false; /* all solutions exhausted */
+ }
+
+ /**
+ * Drops a leading low surrogate from the suffix, if there is one.
+ * This won't be quite as efficient, but the result can be converted to
+ * valid UTF-8.
+ *
+ * No further cleanup is needed, because this suffix is not going to be
+ * used to walk any path through the terms. Null or empty input is returned as-is.
+ */
+ private static String getValidUTF16Suffix(String suffix) {
+ if (suffix != null && suffix.length() > 0
+ && Character.isLowSurrogate(suffix.charAt(0))) {
+ return suffix.substring(1);
+ }
+ return suffix;
+ }
+}
Property changes on: src\java\org\apache\lucene\search\AutomatonTermsEnum.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/search/RegexpQuery.java
===================================================================
--- src/java/org/apache/lucene/search/RegexpQuery.java (revision 0)
+++ src/java/org/apache/lucene/search/RegexpQuery.java (revision 0)
@@ -0,0 +1,105 @@
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonProvider;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A fast regular expression query based on the
+ * {@link org.apache.lucene.util.automaton} package.
+ *
+ * - Comparisons are fast
+ *
+ * - The term dictionary is enumerated in an intelligent way, to avoid
+ * comparisons. See {@link AutomatonQuery} for more details.
+ *
+ *
+ * The supported syntax is documented in the {@link RegExp} class.
+ * Note this might be different than other regular expression implementations.
+ * For some alternatives with different syntax, look under contrib/regex
+ *
+ *
+ * Note this query can be slow, as it needs to iterate over many terms. In order
+ * to prevent extremely slow RegexpQueries, a Regexp term should not start with
+ * the expression .*
+ *
+ * @see RegExp
+ */
+public class RegexpQuery extends AutomatonQuery {
+ /**
+ * Shared provider that knows no named automata: every lookup returns null.
+ */
+ private static final AutomatonProvider defaultProvider = new AutomatonProvider() {
+ public Automaton getAutomaton(String name) throws IOException {
+ return null; // no automaton is registered under any name
+ }
+ };
+
+ /**
+ * Constructs a query for terms matching term.
+ *
+ * By default, all regular expression features are enabled.
+ *
+ *
+ * @param term regular expression.
+ */
+ public RegexpQuery(Term term) {
+ this(term, RegExp.ALL);
+ }
+
+ /**
+ * Constructs a query for terms matching term.
+ *
+ * @param term regular expression.
+ * @param flags optional RegExp features from {@link RegExp}
+ */
+ public RegexpQuery(Term term, int flags) {
+ this(term, flags, defaultProvider);
+ }
+
+ /**
+ * Constructs a query for terms matching term.
+ *
+ * @param term regular expression.
+ * @param flags optional RegExp features from {@link RegExp}
+ * @param provider custom AutomatonProvider for named automata
+ */
+ public RegexpQuery(Term term, int flags, AutomatonProvider provider) {
+ super(term, new RegExp(term.text(), flags).toAutomaton(provider));
+ }
+
+ /** Prints a user-readable version of this query. */
+ @Override
+ public String toString(String field) {
+ StringBuilder buffer = new StringBuilder();
+ if (!term.field().equals(field)) {
+ buffer.append(term.field());
+ buffer.append(":");
+ }
+ buffer.append(term.text());
+ buffer.append(ToStringUtils.boost(getBoost()));
+ return buffer.toString();
+ }
+}
Property changes on: src\java\org\apache\lucene\search\RegexpQuery.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/search/WildcardQuery.java
===================================================================
--- src/java/org/apache/lucene/search/WildcardQuery.java (revision 888316)
+++ src/java/org/apache/lucene/search/WildcardQuery.java (working copy)
@@ -19,68 +19,69 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
/** Implements the wildcard search query. Supported wildcards are *, which
* matches any character sequence (including the empty one), and ?,
* which matches any single character. Note this query can be slow, as it
* needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
- * a Wildcard term should not start with one of the wildcards * or
- * ?.
+ * a Wildcard term should not start with the wildcard *
*
* This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* rewrite method.
*
- * @see WildcardTermEnums */
-public class WildcardQuery extends MultiTermQuery {
- private boolean termContainsWildcard;
- private boolean termIsPrefix;
- protected Term term;
-
+ * @see AutomatonQuery
+ */
+public class WildcardQuery extends AutomatonQuery {
+ /** Wildcard that matches any character sequence (including the empty one) */
+ public static final char WILDCARD_STRING = '*';
+
+ /** Wildcard that matches any single character */
+ public static final char WILDCARD_CHAR = '?';
+
+ /**
+ * Constructs a query for terms matching term.
+ */
public WildcardQuery(Term term) {
- super(term.field());
- this.term = term;
- String text = term.text();
- this.termContainsWildcard = (text.indexOf('*') != -1)
- || (text.indexOf('?') != -1);
- this.termIsPrefix = termContainsWildcard
- && (text.indexOf('?') == -1)
- && (text.indexOf('*') == text.length() - 1);
+ super(term, toAutomaton(term));
}
- @Override
- protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
- if (termIsPrefix) {
- final String text = getTerm().text();
- final Term t = getTerm().createTerm(text.substring(0,text.length()-1));
- if (t.text().length() == 0) {
- final Terms terms = reader.fields().terms(getField());
- return (terms != null) ? terms.iterator() : new EmptyTermsEnum();
+ /**
+ * Convert Lucene wildcard syntax into an automaton: one sub-automaton per character of the term text, concatenated in order.
+ */
+ static Automaton toAutomaton(Term wildcardquery) {
+ List<Automaton> automata = new ArrayList<Automaton>();
+
+ String wildcardText = wildcardquery.text();
+
+ for (int i = 0; i < wildcardText.length(); i++) {
+ final char c = wildcardText.charAt(i);
+ switch(c) {
+ case WILDCARD_STRING: // '*'
+ automata.add(BasicAutomata.makeAnyString());
+ break;
+ case WILDCARD_CHAR: // '?'
+ automata.add(BasicAutomata.makeAnyChar());
+ break;
+ default: // any other character is taken literally
+ automata.add(BasicAutomata.makeChar(c));
}
- return new PrefixTermsEnum(reader, t);
}
- if (termContainsWildcard)
- return new WildcardTermsEnum(reader, getTerm());
- else
- return new SingleTermsEnum(reader, getTerm());
+
+ return BasicOperations.concatenate(automata);
}
@Override @Deprecated
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
- if (termIsPrefix) {
- final String text = getTerm().text();
- final Term t = getTerm().createTerm(text.substring(0,text.length()-1));
- return new PrefixTermEnum(reader, t);
- }
- if (termContainsWildcard)
- return new WildcardTermEnum(reader, getTerm());
- else
- return new SingleTermEnum(reader, getTerm());
+ return new WildcardTermEnum(reader, term);
}
/**
@@ -89,7 +90,7 @@
public Term getTerm() {
return term;
}
-
+
/** Prints a user-readable version of this query. */
@Override
public String toString(String field) {
@@ -102,30 +103,4 @@
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
-
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = super.hashCode();
- result = prime * result + ((term == null) ? 0 : term.hashCode());
- return result;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (!super.equals(obj))
- return false;
- if (getClass() != obj.getClass())
- return false;
- WildcardQuery other = (WildcardQuery) obj;
- if (term == null) {
- if (other.term != null)
- return false;
- } else if (!term.equals(other.term))
- return false;
- return true;
- }
-
}
Index: src/java/org/apache/lucene/search/WildcardTermEnum.java
===================================================================
--- src/java/org/apache/lucene/search/WildcardTermEnum.java (revision 888316)
+++ src/java/org/apache/lucene/search/WildcardTermEnum.java (working copy)
@@ -28,7 +28,7 @@
*
* Term enumerations are always ordered by Term.compareTo(). Each term in
* the enumeration is greater than all that precede it.
- * @deprecated Please use {@link WildcardTermsEnum} instead.
+ * @deprecated Please use {@link AutomatonTermsEnum} instead.
*/
@Deprecated
public class WildcardTermEnum extends FilteredTermEnum {
@@ -93,8 +93,8 @@
* String equality with support for wildcards
********************************************/
- public static final char WILDCARD_STRING = '*';
- public static final char WILDCARD_CHAR = '?';
+ public static final char WILDCARD_STRING = WildcardQuery.WILDCARD_STRING;
+ public static final char WILDCARD_CHAR = WildcardQuery.WILDCARD_CHAR;
/**
* Determines if a word matches a wildcard pattern.
Index: src/java/org/apache/lucene/util/automaton/Automaton.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0)
@@ -0,0 +1,748 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Finite-state automaton with regular expression operations.
+ *
+ * Class invariants:
+ *
+ * - An automaton is either represented explicitly (with {@link State} and
+ * {@link Transition} objects) or with a singleton string (see
+ * {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton
+ * is known to accept exactly one string. (Implicitly, all states and
+ * transitions of an automaton are reachable from its initial state.)
+ *
+ * - Automata are always reduced (see {@link #reduce()}) and have no
+ * transitions to dead states (see {@link #removeDeadTransitions()}).
+ *
+ * - If an automaton is nondeterministic, then {@link #isDeterministic()}
+ * returns false (but the converse is not required).
+ *
+ * - Automata provided as input to operations are generally assumed to be
+ * disjoint.
+ *
+ *
+ * If the states or transitions are manipulated manually, the
+ * {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods
+ * should be used afterwards to restore representation invariants that are
+ * assumed by the built-in automata operations.
+ *
+ *
+ * WARNING: The status of the Automaton feature is
+ * experimental. The APIs introduced here might change in the future and will
+ * not be supported anymore in such a case.
+ */
+public class Automaton implements Serializable, Cloneable {
+
+ static final long serialVersionUID = 10001;
+
+ /**
+ * Minimize using Hopcroft's O(n log n) algorithm. This is regarded as one of
+ * the most generally efficient algorithms that exist.
+ *
+ * @see #setMinimization(int)
+ */
+ public static final int MINIMIZE_HOPCROFT = 2;
+
+ /** Selects minimization algorithm (default: MINIMIZE_HOPCROFT). */
+ static int minimization = MINIMIZE_HOPCROFT;
+
+ /** Initial state of this automaton. */
+ State initial;
+
+ /**
+ * If true, then this automaton is definitely deterministic (i.e., there are
+ * no choices for any run, but a run may crash).
+ */
+ boolean deterministic;
+
+ /** Extra data associated with this automaton. */
+ transient Object info;
+
+ /**
+ * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)}
+ */
+ int hash_code;
+
+ /** Singleton string. Null if not applicable. */
+ String singleton;
+
+ /** Minimize always flag. */
+ static boolean minimize_always = false;
+
+ /**
+ * Selects whether operations may modify the input automata (default:
+ * false).
+ */
+ static boolean allow_mutation = false;
+
+ /**
+ * Constructs a new automaton that accepts the empty language. Using this
+ * constructor, automata can be constructed manually from {@link State} and
+ * {@link Transition} objects.
+ *
+ * @see #setInitialState(State)
+ * @see State
+ * @see Transition
+ */
+ public Automaton() {
+ initial = new State();
+ deterministic = true;
+ singleton = null;
+ }
+
+ boolean isDebug() {
+ return System.getProperty("dk.brics.automaton.debug") != null;
+ }
+
+ /**
+ * Selects minimization algorithm (default: MINIMIZE_HOPCROFT).
+ *
+ * @param algorithm minimization algorithm
+ */
+ static public void setMinimization(int algorithm) {
+ minimization = algorithm;
+ }
+
+ /**
+ * Sets or resets minimize always flag. If this flag is set, then
+ * {@link MinimizationOperations#minimize(Automaton)} will automatically be
+ * invoked after all operations that otherwise may produce non-minimal
+ * automata. By default, the flag is not set.
+ *
+ * @param flag if true, the flag is set
+ */
+ static public void setMinimizeAlways(boolean flag) {
+ minimize_always = flag;
+ }
+
+ /**
+ * Sets or resets allow mutate flag. If this flag is set, then all automata
+ * operations may modify automata given as input; otherwise, operations will
+ * always leave input automata languages unmodified. By default, the flag is
+ * not set.
+ *
+ * @param flag if true, the flag is set
+ * @return previous value of the flag
+ */
+ static public boolean setAllowMutate(boolean flag) {
+ boolean b = allow_mutation;
+ allow_mutation = flag;
+ return b;
+ }
+
+ /**
+ * Returns the state of the allow mutate flag. If this flag is set, then all
+ * automata operations may modify automata given as input; otherwise,
+ * operations will always leave input automata languages unmodified. By
+ * default, the flag is not set.
+ *
+ * @return current value of the flag
+ */
+ static boolean getAllowMutate() {
+ return allow_mutation;
+ }
+
+ void checkMinimizeAlways() {
+ if (minimize_always) MinimizationOperations.minimize(this);
+ }
+
+ boolean isSingleton() {
+ return singleton != null;
+ }
+
+ /**
+ * Returns the singleton string for this automaton. An automaton that accepts
+ * exactly one string may be represented in singleton mode. In that
+ * case, this method may be used to obtain the string.
+ *
+ * @return string, null if this automaton is not in singleton mode.
+ */
+ public String getSingleton() {
+ return singleton;
+ }
+
+ /**
+ * Sets initial state.
+ *
+ * @param s state
+ */
+ public void setInitialState(State s) {
+ initial = s;
+ singleton = null;
+ }
+
+ /**
+ * Gets initial state.
+ *
+ * @return state
+ */
+ public State getInitialState() {
+ expandSingleton();
+ return initial;
+ }
+
+ /**
+ * Returns deterministic flag for this automaton.
+ *
+ * @return true if the automaton is definitely deterministic, false if the
+ * automaton may be nondeterministic
+ */
+ public boolean isDeterministic() {
+ return deterministic;
+ }
+
+ /**
+ * Sets deterministic flag for this automaton. This method should (only) be
+ * used if automata are constructed manually.
+ *
+ * @param deterministic true if the automaton is definitely deterministic,
+ * false if the automaton may be nondeterministic
+ */
+ public void setDeterministic(boolean deterministic) {
+ this.deterministic = deterministic;
+ }
+
+ /**
+ * Associates extra information with this automaton.
+ *
+ * @param info extra information
+ */
+ public void setInfo(Object info) {
+ this.info = info;
+ }
+
+ /**
+ * Returns extra information associated with this automaton.
+ *
+ * @return extra information
+ * @see #setInfo(Object)
+ */
+ public Object getInfo() {
+ return info;
+ }
+
+ /**
+ * Returns the set of states that are reachable from the initial state.
+ *
+ * @return set of {@link State} objects
+ */
+ public Set<State> getStates() {
+ expandSingleton();
+ Set<State> visited;
+ if (isDebug()) visited = new LinkedHashSet<State>(); // debug mode: keep discovery order
+ else visited = new HashSet<State>();
+ LinkedList<State> worklist = new LinkedList<State>(); // states whose transitions are still to be followed
+ worklist.add(initial);
+ visited.add(initial);
+ while (worklist.size() > 0) {
+ State s = worklist.removeFirst();
+ Collection<Transition> tr;
+ if (isDebug()) tr = s.getSortedTransitions(false);
+ else tr = s.transitions;
+ for (Transition t : tr)
+ if (!visited.contains(t.to)) {
+ visited.add(t.to);
+ worklist.add(t.to);
+ }
+ }
+ return visited;
+ }
+
+ /**
+ * Returns the set of reachable accept states.
+ *
+ * @return set of {@link State} objects
+ */
+ public Set<State> getAcceptStates() {
+ expandSingleton();
+ HashSet<State> accepts = new HashSet<State>();
+ HashSet<State> visited = new HashSet<State>();
+ LinkedList<State> worklist = new LinkedList<State>(); // states whose transitions are still to be followed
+ worklist.add(initial);
+ visited.add(initial);
+ while (worklist.size() > 0) {
+ State s = worklist.removeFirst();
+ if (s.accept) accepts.add(s);
+ for (Transition t : s.transitions)
+ if (!visited.contains(t.to)) {
+ visited.add(t.to);
+ worklist.add(t.to);
+ }
+ }
+ return accepts;
+ }
+
+ /**
+ * Assigns consecutive numbers to the given states.
+ */
+ static void setStateNumbers(Set<State> states) {
+ int number = 0; // numbering starts at 0 and follows the set's iteration order
+ for (State s : states)
+ s.number = number++;
+ }
+
+ /**
+ * Adds transitions to explicit crash state to ensure that transition function
+ * is total.
+ */
+ void totalize() {
+ State s = new State();
+ s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+ s));
+ for (State p : getStates()) {
+ int maxi = Character.MIN_VALUE;
+ for (Transition t : p.getSortedTransitions(false)) {
+ if (t.min > maxi) p.transitions.add(new Transition((char) maxi,
+ (char) (t.min - 1), s));
+ if (t.max + 1 > maxi) maxi = t.max + 1;
+ }
+ if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition(
+ (char) maxi, Character.MAX_VALUE, s));
+ }
+ }
+
+ /**
+ * Restores representation invariant. This method must be invoked before any
+ * built-in automata operation is performed if automaton states or transitions
+ * are manipulated manually.
+ *
+ * @see #setDeterministic(boolean)
+ */
+ public void restoreInvariant() {
+ removeDeadTransitions();
+ }
+
+ /**
+ * Reduces this automaton. An automaton is "reduced" by combining overlapping
+ * and adjacent edge intervals with same destination.
+ */
+ public void reduce() {
+ if (isSingleton()) return;
+ Set<State> states = getStates();
+ setStateNumbers(states);
+ for (State s : states) {
+ List<Transition> st = s.getSortedTransitions(true);
+ s.resetTransitions();
+ State p = null; // destination of the interval currently being merged
+ int min = -1, max = -1;
+ for (Transition t : st) {
+ if (p == t.to) {
+ if (t.min <= max + 1) { // overlapping or adjacent: extend the current interval
+ if (t.max > max) max = t.max;
+ } else {
+ if (p != null) s.transitions.add(new Transition((char) min,
+ (char) max, p));
+ min = t.min;
+ max = t.max;
+ }
+ } else {
+ if (p != null) s.transitions.add(new Transition((char) min,
+ (char) max, p));
+ p = t.to;
+ min = t.min;
+ max = t.max;
+ }
+ }
+ if (p != null) s.transitions
+ .add(new Transition((char) min, (char) max, p));
+ }
+ }
+
+ /**
+ * Returns sorted array of all interval start points.
+ */
+ char[] getStartPoints() {
+ Set<Character> pointset = new HashSet<Character>();
+ for (State s : getStates()) {
+ pointset.add(Character.MIN_VALUE);
+ for (Transition t : s.transitions) {
+ pointset.add(t.min);
+ if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1)); // first char after the interval
+ }
+ }
+ char[] points = new char[pointset.size()];
+ int n = 0;
+ for (Character m : pointset)
+ points[n++] = m;
+ Arrays.sort(points);
+ return points;
+ }
+
+ /**
+ * Returns the set of live states. A state is "live" if an accept state is
+ * reachable from it.
+ *
+ * @return set of {@link State} objects
+ */
+ public Set<State> getLiveStates() {
+ expandSingleton();
+ return getLiveStates(getStates());
+ }
+
+ private Set<State> getLiveStates(Set<State> states) {
+ HashMap<State,Set<State>> map = new HashMap<State,Set<State>>(); // state -> states with a transition into it
+ for (State s : states)
+ map.put(s, new HashSet<State>());
+ for (State s : states)
+ for (Transition t : s.transitions)
+ map.get(t.to).add(s);
+ Set<State> live = new HashSet<State>(getAcceptStates());
+ LinkedList<State> worklist = new LinkedList<State>(live); // walk backwards from the accept states
+ while (worklist.size() > 0) {
+ State s = worklist.removeFirst();
+ for (State p : map.get(s))
+ if (!live.contains(p)) {
+ live.add(p);
+ worklist.add(p);
+ }
+ }
+ return live;
+ }
+
+ /**
+ * Removes transitions to dead states and calls {@link #reduce()} and
+ * {@link #clearHashCode()}. (A state is "dead" if no accept state is
+ * reachable from it.)
+ */
+ public void removeDeadTransitions() {
+ clearHashCode();
+ if (isSingleton()) return;
+ Set<State> states = getStates();
+ Set<State> live = getLiveStates(states);
+ for (State s : states) {
+ Set<Transition> st = s.transitions; // keep the old set, then re-add only transitions into live states
+ s.resetTransitions();
+ for (Transition t : st)
+ if (live.contains(t.to)) s.transitions.add(t);
+ }
+ reduce();
+ }
+
+ /**
+ * Returns a sorted array of transitions for each state (and sets state
+ * numbers).
+ */
+ static Transition[][] getSortedTransitions(Set<State> states) {
+ setStateNumbers(states);
+ Transition[][] transitions = new Transition[states.size()][];
+ for (State s : states)
+ transitions[s.number] = s.getSortedTransitionArray(false); // row index is the state number just assigned
+ return transitions;
+ }
+
+ /**
+ * Expands singleton representation to normal representation. Does nothing if
+ * not in singleton representation.
+ */
+ public void expandSingleton() {
+ if (isSingleton()) {
+ State p = new State();
+ initial = p;
+ for (int i = 0; i < singleton.length(); i++) {
+ State q = new State();
+ p.transitions.add(new Transition(singleton.charAt(i), q));
+ p = q;
+ }
+ p.accept = true;
+ deterministic = true;
+ singleton = null;
+ }
+ }
+
+ /**
+ * Returns the number of states in this automaton.
+ */
+ public int getNumberOfStates() {
+ if (isSingleton()) return singleton.length() + 1;
+ return getStates().size();
+ }
+
+ /**
+ * Returns the number of transitions in this automaton. This number is counted
+ * as the total number of edges, where one edge may be a character interval.
+ */
+ public int getNumberOfTransitions() {
+ if (isSingleton()) return singleton.length();
+ int c = 0;
+ for (State s : getStates())
+ c += s.transitions.size();
+ return c;
+ }
+
+ /**
+ * Returns true if the language of this automaton is equal to the language of
+ * the given automaton. Implemented using hashCode and
+ * subsetOf.
+ */
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == this) return true;
+ if (!(obj instanceof Automaton)) return false;
+ Automaton a = (Automaton) obj;
+ if (isSingleton() && a.isSingleton()) return singleton.equals(a.singleton);
+ return hashCode() == a.hashCode() && BasicOperations.subsetOf(this, a)
+ && BasicOperations.subsetOf(a, this);
+ }
+
+ /**
+ * Returns hash code for this automaton. The hash code is based on the number
+ * of states and transitions in the minimized automaton. Invoking this method
+ * may involve minimizing the automaton.
+ */
+ @Override
+ public int hashCode() {
+ if (hash_code == 0) MinimizationOperations.minimize(this);
+ return hash_code;
+ }
+
+ /**
+ * Must be invoked when the stored hash code may no longer be valid.
+ */
+ void clearHashCode() {
+ hash_code = 0;
+ }
+
+ /**
+ * Returns a string representation of this automaton.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ if (isSingleton()) {
+ b.append("singleton: ");
+ for (char c : singleton.toCharArray())
+ Transition.appendCharString(c, b);
+ b.append("\n");
+ } else {
+ Set<State> states = getStates();
+ setStateNumbers(states); // numbers are needed for the "initial state" line below
+ b.append("initial state: ").append(initial.number).append("\n");
+ for (State s : states)
+ b.append(s.toString());
+ }
+ return b.toString();
+ }
+
+ /**
+ * Returns Graphviz Dot representation of this automaton.
+ */
+ public String toDot() {
+ StringBuilder b = new StringBuilder("digraph Automaton {\n");
+ b.append(" rankdir = LR;\n");
+ Set<State> states = getStates();
+ setStateNumbers(states); // state numbers double as Dot node names
+ for (State s : states) {
+ b.append(" ").append(s.number);
+ if (s.accept) b.append(" [shape=doublecircle,label=\"\"];\n");
+ else b.append(" [shape=circle,label=\"\"];\n");
+ if (s == initial) {
+ b.append(" initial [shape=plaintext,label=\"\"];\n");
+ b.append(" initial -> ").append(s.number).append("\n");
+ }
+ for (Transition t : s.transitions) {
+ b.append(" ").append(s.number);
+ t.appendDot(b);
+ }
+ }
+ return b.append("}\n").toString();
+ }
+
+ /**
+ * Returns a clone of this automaton, expands if singleton.
+ */
+ Automaton cloneExpanded() {
+ Automaton a = clone();
+ a.expandSingleton();
+ return a;
+ }
+
+ /**
+ * Returns a clone of this automaton unless allow_mutation is
+ * set, expands if singleton.
+ */
+ Automaton cloneExpandedIfRequired() {
+ if (allow_mutation) {
+ expandSingleton();
+ return this;
+ } else return cloneExpanded();
+ }
+
+ /**
+ * Returns a clone of this automaton.
+ */
+ @Override
+ public Automaton clone() {
+ try {
+ Automaton a = (Automaton) super.clone();
+ if (!isSingleton()) {
+ HashMap<State,State> m = new HashMap<State,State>(); // original state -> its copy
+ Set<State> states = getStates();
+ for (State s : states)
+ m.put(s, new State());
+ for (State s : states) {
+ State p = m.get(s);
+ p.accept = s.accept;
+ if (s == initial) a.initial = p;
+ for (Transition t : s.transitions)
+ p.transitions.add(new Transition(t.min, t.max, m.get(t.to)));
+ }
+ }
+ return a;
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Returns a clone of this automaton, or this automaton itself if
+ * allow_mutation flag is set.
+ */
+ Automaton cloneIfRequired() {
+ if (allow_mutation) return this;
+ else return clone();
+ }
+
+ /**
+ * See {@link BasicOperations#concatenate(Automaton, Automaton)}.
+ */
+ public Automaton concatenate(Automaton a) {
+ return BasicOperations.concatenate(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#concatenate(List)}.
+ */
+ static public Automaton concatenate(List l) {
+ return BasicOperations.concatenate(l);
+ }
+
+ /**
+ * See {@link BasicOperations#optional(Automaton)}.
+ */
+ public Automaton optional() {
+ return BasicOperations.optional(this);
+ }
+
+ /**
+ * See {@link BasicOperations#repeat(Automaton)}.
+ */
+ public Automaton repeat() {
+ return BasicOperations.repeat(this);
+ }
+
+ /**
+ * See {@link BasicOperations#repeat(Automaton, int)}.
+ */
+ public Automaton repeat(int min) {
+ return BasicOperations.repeat(this, min);
+ }
+
+ /**
+ * See {@link BasicOperations#repeat(Automaton, int, int)}.
+ */
+ public Automaton repeat(int min, int max) {
+ return BasicOperations.repeat(this, min, max);
+ }
+
+ /**
+ * See {@link BasicOperations#complement(Automaton)}.
+ */
+ public Automaton complement() {
+ return BasicOperations.complement(this);
+ }
+
+ /**
+ * See {@link BasicOperations#minus(Automaton, Automaton)}.
+ */
+ public Automaton minus(Automaton a) {
+ return BasicOperations.minus(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#intersection(Automaton, Automaton)}.
+ */
+ public Automaton intersection(Automaton a) {
+ return BasicOperations.intersection(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#subsetOf(Automaton, Automaton)}.
+ */
+ public boolean subsetOf(Automaton a) {
+ return BasicOperations.subsetOf(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#union(Automaton, Automaton)}.
+ */
+ public Automaton union(Automaton a) {
+ return BasicOperations.union(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#union(Collection)}.
+ */
+ static public Automaton union(Collection l) {
+ return BasicOperations.union(l);
+ }
+
+ /**
+ * See {@link BasicOperations#determinize(Automaton)}.
+ */
+ public void determinize() {
+ BasicOperations.determinize(this);
+ }
+
+ /**
+ * See {@link BasicOperations#isEmptyString(Automaton)}.
+ */
+ public boolean isEmptyString() {
+ return BasicOperations.isEmptyString(this);
+ }
+
+ /**
+ * See {@link MinimizationOperations#minimize(Automaton)}. Returns the
+ * automaton being given as argument.
+ */
+ public static Automaton minimize(Automaton a) {
+ MinimizationOperations.minimize(a);
+ return a;
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\Automaton.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0)
@@ -0,0 +1,53 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+
+/**
+ * Automaton provider for RegExp.
+ * {@link RegExp#toAutomaton(AutomatonProvider)}
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public interface AutomatonProvider {
+
+ /**
+ * Returns automaton of the given name.
+ *
+ * @param name automaton name
+ * @return automaton
+ * @throws IOException if errors occur
+ */
+ public Automaton getAutomaton(String name) throws IOException;
+}
Property changes on: src\java\org\apache\lucene\util\automaton\AutomatonProvider.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/BasicAutomata.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0)
@@ -0,0 +1,482 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Construction of basic automata.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class BasicAutomata {
+ // used by getWhitespaceAutomaton to match basic whitespace
+ private static final Automaton ws = Automaton.minimize(BasicAutomata
+ .makeCharSet(" \t\n\r").repeat());
+
+ private BasicAutomata() {}
+
+ /**
+ * Returns a new (deterministic) automaton with the empty language.
+ */
+ public static Automaton makeEmpty() {
+ Automaton a = new Automaton();
+ State s = new State();
+ a.initial = s;
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts only the empty string.
+ */
+ public static Automaton makeEmptyString() {
+ Automaton a = new Automaton();
+ a.singleton = "";
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts all strings.
+ */
+ public static Automaton makeAnyString() {
+ Automaton a = new Automaton();
+ State s = new State();
+ a.initial = s;
+ s.accept = true;
+ s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+ s));
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts any single character.
+ */
+ public static Automaton makeAnyChar() {
+ return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE);
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts a single character of
+ * the given value.
+ */
+ public static Automaton makeChar(char c) {
+ Automaton a = new Automaton();
+ a.singleton = Character.toString(c);
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts a single char whose
+ * value is in the given interval (including both end points).
+ */
+ public static Automaton makeCharRange(char min, char max) {
+ if (min == max) return makeChar(min);
+ Automaton a = new Automaton();
+ State s1 = new State();
+ State s2 = new State();
+ a.initial = s1;
+ s2.accept = true;
+ if (min <= max) s1.transitions.add(new Transition(min, max, s2));
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts a single character in
+ * the given set.
+ */
+ public static Automaton makeCharSet(String set) {
+ if (set.length() == 1) return makeChar(set.charAt(0));
+ Automaton a = new Automaton();
+ State s1 = new State();
+ State s2 = new State();
+ a.initial = s1;
+ s2.accept = true;
+ for (int i = 0; i < set.length(); i++)
+ s1.transitions.add(new Transition(set.charAt(i), s2));
+ a.deterministic = true;
+ a.reduce();
+ return a;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of length
+ * x.substring(n).length().
+ */
+ private static State anyOfRightLength(String x, int n) {
+ State s = new State();
+ if (x.length() == n) s.setAccept(true);
+ else s.addTransition(new Transition('0', '9', anyOfRightLength(x, n + 1)));
+ return s;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of value at least
+ * x.substring(n) and length x.substring(n).length().
+ */
+ private static State atLeast(String x, int n, Collection<State> initials,
+ boolean zeros) {
+ State s = new State();
+ if (x.length() == n) s.setAccept(true);
+ else {
+ if (zeros) initials.add(s); // only leading '0' digits have been consumed so far
+ char c = x.charAt(n);
+ s.addTransition(new Transition(c, atLeast(x, n + 1, initials, zeros
+ && c == '0')));
+ if (c < '9') s.addTransition(new Transition((char) (c + 1), '9',
+ anyOfRightLength(x, n + 1)));
+ }
+ return s;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of value at most
+ * x.substring(n) and length x.substring(n).length().
+ */
+ private static State atMost(String x, int n) {
+ State s = new State();
+ if (x.length() == n) s.setAccept(true);
+ else {
+ char c = x.charAt(n);
+ s.addTransition(new Transition(c, atMost(x, n + 1))); // same digit: the rest must still be at most x's rest
+ if (c > '0') s.addTransition(new Transition('0', (char) (c - 1),
+ anyOfRightLength(x, n + 1)));
+ }
+ return s;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of value between
+ * x.substring(n) and y.substring(n) and of length x.substring(n).length()
+ * (which must be equal to y.substring(n).length()).
+ */
+ private static State between(String x, String y, int n,
+ Collection<State> initials, boolean zeros) {
+ State s = new State();
+ if (x.length() == n) s.setAccept(true);
+ else {
+ if (zeros) initials.add(s);
+ char cx = x.charAt(n);
+ char cy = y.charAt(n);
+ if (cx == cy) s.addTransition(new Transition(cx, between(x, y, n + 1,
+ initials, zeros && cx == '0')));
+ else { // cx < cy
+ // NOTE(review): this branch was lost in extraction and has been restored
+ // from the contracts of atLeast/atMost/anyOfRightLength; verify against
+ // the upstream dk.brics.automaton source.
+ s.addTransition(new Transition(cx, atLeast(x, n + 1, initials, zeros
+ && cx == '0')));
+ s.addTransition(new Transition(cy, atMost(y, n + 1)));
+ if (cx + 1 < cy) s.addTransition(new Transition((char) (cx + 1),
+ (char) (cy - 1), anyOfRightLength(x, n + 1)));
+ }
+ }
+ return s;
+ }
+
+ /**
+ * Returns a new automaton that accepts strings representing decimal
+ * non-negative integers in the given interval.
+ *
+ * @param min minimal value of interval
+ * @param max maximal value of interval (both end points are included in the
+ * interval)
+ * @param digits if >0, use fixed number of digits (strings must be prefixed
+ * by 0's to obtain the right length) - otherwise, the number of
+ * digits is not fixed
+ * @exception IllegalArgumentException if min>max or if numbers in the
+ * interval cannot be expressed with the given fixed number of
+ * digits
+ */
+ public static Automaton makeInterval(int min, int max, int digits)
+ throws IllegalArgumentException {
+ Automaton a = new Automaton();
+ String x = Integer.toString(min);
+ String y = Integer.toString(max);
+ if (min > max || (digits > 0 && y.length() > digits)) throw new IllegalArgumentException();
+ int d; // number of digits both bounds are padded to
+ if (digits > 0) d = digits;
+ else d = y.length();
+ StringBuilder bx = new StringBuilder();
+ for (int i = x.length(); i < d; i++)
+ bx.append('0');
+ bx.append(x);
+ x = bx.toString();
+ StringBuilder by = new StringBuilder();
+ for (int i = y.length(); i < d; i++)
+ by.append('0');
+ by.append(y);
+ y = by.toString();
+ Collection<State> initials = new ArrayList<State>();
+ a.initial = between(x, y, 0, initials, digits <= 0);
+ if (digits <= 0) {
+ ArrayList<StatePair> pairs = new ArrayList<StatePair>();
+ for (State p : initials)
+ if (a.initial != p) pairs.add(new StatePair(a.initial, p));
+ BasicOperations.addEpsilons(a, pairs);
+ a.initial.addTransition(new Transition('0', a.initial));
+ a.deterministic = false;
+ } else a.deterministic = true;
+ a.checkMinimizeAlways();
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts the single given
+ * string.
+ */
+ public static Automaton makeString(String s) {
+ Automaton a = new Automaton();
+ a.singleton = s;
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Constructs automaton that accept strings representing nonnegative integers
+ * that are not larger than the given value.
+ *
+ * @param n string representation of maximum value
+ */
+ public static Automaton makeMaxInteger(String n) {
+ int i = 0;
+ while (i < n.length() && n.charAt(i) == '0')
+ i++;
+ StringBuilder b = new StringBuilder();
+ b.append("0*(0|");
+ if (i < n.length()) b.append("[0-9]{1," + (n.length() - i - 1) + "}|");
+ maxInteger(n.substring(i), 0, b);
+ b.append(")");
+ return Automaton.minimize((new RegExp(b.toString())).toAutomaton());
+ }
+
+ // Appends to b a nested regular expression for the digits of n from
+ // position i onwards: at each digit, either a strictly smaller digit
+ // followed by the right number of arbitrary digits, or the digit itself
+ // followed by the expression for the remaining positions.
+ private static void maxInteger(String n, int i, StringBuilder b) {
+   int opened = 0;
+   for (int pos = i; pos < n.length(); pos++) {
+     char digit = n.charAt(pos);
+     b.append('(');
+     opened++;
+     if (digit != '0') {
+       b.append("[0-").append((char) (digit - 1)).append("][0-9]{")
+           .append(n.length() - pos - 1).append("}|");
+     }
+     b.append(digit);
+   }
+   // innermost (empty) group, then close every group opened above
+   b.append('(').append(')');
+   while (opened-- > 0)
+     b.append(')');
+ }
+
+ /**
+  * Constructs an automaton accepting strings that represent nonnegative
+  * integers not less than the given value.
+  *
+  * @param n string representation of minimum value
+  */
+ public static Automaton makeMinInteger(String n) {
+   // skip leading zeros, but never the last character of n
+   int start = 0;
+   while (start + 1 < n.length() && n.charAt(start) == '0')
+     start++;
+   StringBuilder regex = new StringBuilder();
+   regex.append("0*");
+   minInteger(n.substring(start), 0, regex);
+   regex.append("[0-9]*");
+   RegExp expression = new RegExp(regex.toString());
+   return Automaton.minimize(expression.toAutomaton());
+ }
+
+ // Appends to b a nested regular expression for the digits of n from
+ // position i onwards: at each digit, either a strictly larger digit
+ // followed by the right number of arbitrary digits, or the digit itself
+ // followed by the expression for the remaining positions.
+ private static void minInteger(String n, int i, StringBuilder b) {
+   int opened = 0;
+   for (int pos = i; pos < n.length(); pos++) {
+     char digit = n.charAt(pos);
+     b.append('(');
+     opened++;
+     if (digit != '9') {
+       b.append("[").append((char) (digit + 1)).append("-9][0-9]{")
+           .append(n.length() - pos - 1).append("}|");
+     }
+     b.append(digit);
+   }
+   // innermost (empty) group, then close every group opened above
+   b.append('(').append(')');
+   while (opened-- > 0)
+     b.append(')');
+ }
+
+ /**
+  * Constructs an automaton accepting strings that represent decimal numbers
+  * which can be written with at most the given number of digits. Surrounding
+  * whitespace is permitted.
+  *
+  * @param i max number of necessary digits
+  */
+ public static Automaton makeTotalDigits(int i) {
+   StringBuilder pattern = new StringBuilder();
+   pattern.append("[ \t\n\r]*[-+]?0*([0-9]{0,").append(i);
+   pattern.append("}|((([0-9]\\.*){0,").append(i);
+   pattern.append("})&@\\.@)0*)[ \t\n\r]*");
+   RegExp expression = new RegExp(pattern.toString());
+   return Automaton.minimize(expression.toAutomaton());
+ }
+
+ /**
+  * Constructs an automaton accepting strings that represent decimal numbers
+  * which can be written with at most the given number of digits in the
+  * fraction part. Surrounding whitespace is permitted.
+  *
+  * @param i max number of necessary fraction digits
+  */
+ public static Automaton makeFractionDigits(int i) {
+   StringBuilder pattern = new StringBuilder();
+   pattern.append("[ \t\n\r]*[-+]?[0-9]+(\\.[0-9]{0,").append(i);
+   pattern.append("}0*)?[ \t\n\r]*");
+   RegExp expression = new RegExp(pattern.toString());
+   return Automaton.minimize(expression.toAutomaton());
+ }
+
+ /**
+  * Constructs an automaton accepting strings that represent the given
+  * integer. Surrounding whitespace is permitted.
+  *
+  * @param value string representation of integer
+  */
+ public static Automaton makeIntegerValue(String value) {
+   // Find the first non-zero digit; remember whether a '-' was passed on the way.
+   boolean negative = false;
+   int pos = 0;
+   for (; pos < value.length(); pos++) {
+     char ch = value.charAt(pos);
+     if (ch == '-') negative = true;
+     if (ch >= '1' && ch <= '9') break;
+   }
+   String digits = value.substring(pos);
+   if (digits.length() == 0) digits = "0";
+   // '-' is mandatory for negative values, '+' is optional otherwise
+   Automaton sign;
+   if (negative) sign = makeChar('-');
+   else sign = makeChar('+').optional();
+   Automaton blank = getWhitespaceAutomaton();
+   Automaton number = sign.concatenate(makeChar('0').repeat()).concatenate(
+       makeString(digits));
+   return Automaton.minimize(blank.concatenate(number).concatenate(blank));
+ }
+
+ /**
+ * Constructs automaton that accept strings representing the given decimal
+ * number. Surrounding whitespace is permitted.
+ *
+ * @param value string representation of decimal number
+ * @return minimized automaton assembled from: whitespace, sign, any number of
+ * leading zeros, the integer part, the fraction part, whitespace
+ */
+ public static Automaton makeDecimalValue(String value) {
+ boolean minus = false;
+ int i = 0;
+ // Scan forward to the first non-zero digit or '.', noting any '-' seen on the way.
+ while (i < value.length()) {
+ char c = value.charAt(i);
+ if (c == '-') minus = true;
+ if ((c >= '1' && c <= '9') || c == '.') break;
+ i++;
+ }
+ // b1 collects the integer part, b2 the fraction part.
+ StringBuilder b1 = new StringBuilder();
+ StringBuilder b2 = new StringBuilder();
+ int p = value.indexOf('.', i);
+ if (p == -1) b1.append(value.substring(i));
+ else {
+ b1.append(value.substring(i, p));
+ // i is reused: scan backwards from the end to the last non-zero digit
+ // after the decimal point, which drops trailing zeros of the fraction.
+ i = value.length() - 1;
+ while (i > p) {
+ char c = value.charAt(i);
+ if (c >= '1' && c <= '9') break;
+ i--;
+ }
+ b2.append(value.substring(p + 1, i + 1));
+ }
+ // An empty integer part is treated as "0".
+ if (b1.length() == 0) b1.append("0");
+ // '-' is mandatory when one was seen, otherwise '+' is optional.
+ Automaton s;
+ if (minus) s = makeChar('-');
+ else s = makeChar('+').optional();
+ // d matches the fraction: with no significant fraction digits it is an
+ // optional '.' followed by repeat(1) of '0' (presumably one or more zeros --
+ // confirm against Automaton.repeat(int)); otherwise '.' + digits + zeros.
+ Automaton d;
+ if (b2.length() == 0) d = makeChar('.')
+ .concatenate(makeChar('0').repeat(1)).optional();
+ else d = makeChar('.').concatenate(makeString(b2.toString())).concatenate(
+ makeChar('0').repeat());
+ Automaton ws = getWhitespaceAutomaton();
+ return Automaton.minimize(ws.concatenate(
+ s.concatenate(makeChar('0').repeat()).concatenate(
+ makeString(b1.toString())).concatenate(d)).concatenate(ws));
+ }
+
+ /**
+  * Constructs deterministic automaton that matches strings that contain the
+  * given substring.
+  *
+  * @param s substring to look for
+  * @return automaton with <code>s.length() + 1</code> states, flagged
+  *         deterministic
+  */
+ public static Automaton makeStringMatcher(String s) {
+   Automaton a = new Automaton();
+   // states[i] is the state reached after the first i characters of s matched
+   State[] states = new State[s.length() + 1];
+   states[0] = a.initial;
+   for (int i = 0; i < s.length(); i++)
+     states[i + 1] = new State();
+   // the last state accepts and loops on every character
+   State f = states[s.length()];
+   f.accept = true;
+   f.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+       f));
+   for (int i = 0; i < s.length(); i++) {
+     // characters that already have a transition out of states[i]
+     Set<Character> done = new HashSet<Character>();
+     char c = s.charAt(i);
+     // the expected next character advances the match
+     states[i].transitions.add(new Transition(c, states[i + 1]));
+     done.add(c);
+     // fall back to the longest shorter prefix that is still matched when
+     // character d arrives instead of c
+     for (int j = i; j >= 1; j--) {
+       char d = s.charAt(j - 1);
+       if (!done.contains(d)
+           && s.substring(0, j - 1).equals(s.substring(i - j + 1, i))) {
+         states[i].transitions.add(new Transition(d, states[j]));
+         done.add(d);
+       }
+     }
+     char[] da = new char[done.size()];
+     int h = 0;
+     for (char w : done)
+       da[h++] = w;
+     Arrays.sort(da);
+     // every character range not handled above leads back to states[0];
+     // 'from' is an int so it can step past Character.MAX_VALUE and end the loop
+     int from = Character.MIN_VALUE;
+     int k = 0;
+     while (from <= Character.MAX_VALUE) {
+       while (k < da.length && da[k] == from) {
+         k++;
+         from++;
+       }
+       if (from <= Character.MAX_VALUE) {
+         int to = Character.MAX_VALUE;
+         if (k < da.length) {
+           to = da[k] - 1;
+           k++;
+         }
+         states[i].transitions.add(new Transition((char) from, (char) to,
+             states[0]));
+         from = to + 2;
+       }
+     }
+   }
+   a.deterministic = true;
+   return a;
+ }
+
+ /**
+ * Returns the automaton stored in the <code>ws</code> field, which is
+ * declared outside this excerpt (presumably the shared whitespace
+ * automaton, judging by the method name -- confirm at the declaration).
+ */
+ private static Automaton getWhitespaceAutomaton() {
+ return ws;
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\BasicAutomata.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/BasicOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 0)
@@ -0,0 +1,624 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Basic automata operations.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class BasicOperations {
+
+ // Not instantiable: every operation in this class is a static method.
+ private BasicOperations() {}
+
+ /**
+  * Returns an automaton accepting the concatenation of the languages of the
+  * two given automata.
+  * <p>
+  * Complexity: linear in number of states.
+  */
+ static public Automaton concatenate(Automaton a1, Automaton a2) {
+   // two single-string automata concatenate to another single-string automaton
+   if (a1.isSingleton() && a2.isSingleton()) {
+     return BasicAutomata.makeString(a1.singleton + a2.singleton);
+   }
+   Automaton head;
+   Automaton tail;
+   if (a1 == a2) {
+     // same instance on both sides: both copies must be cloned
+     head = a1.cloneExpanded();
+     tail = a2.cloneExpanded();
+   } else {
+     head = a1.cloneExpandedIfRequired();
+     tail = a2.cloneExpandedIfRequired();
+   }
+   // every accept state of the head hands over to the start of the tail
+   for (State accepting : head.getAcceptStates()) {
+     accepting.accept = false;
+     accepting.addEpsilon(tail.initial);
+   }
+   head.deterministic = false;
+   head.clearHashCode();
+   head.checkMinimizeAlways();
+   return head;
+ }
+
+ /**
+  * Returns an automaton that accepts the concatenation of the languages of the
+  * given automata.
+  * <p>
+  * Complexity: linear in total number of states.
+  *
+  * @param l automata to concatenate, in order; an empty list yields the
+  *          empty-string automaton
+  */
+ static public Automaton concatenate(List<Automaton> l) {
+   if (l.isEmpty()) return BasicAutomata.makeEmptyString();
+   boolean all_singleton = true;
+   for (Automaton a : l)
+     if (!a.isSingleton()) {
+       all_singleton = false;
+       break;
+     }
+   if (all_singleton) {
+     // only single strings: just join them
+     StringBuilder b = new StringBuilder();
+     for (Automaton a : l)
+       b.append(a.singleton);
+     return BasicAutomata.makeString(b.toString());
+   } else {
+     // one empty language makes the whole concatenation empty
+     for (Automaton a : l)
+       if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty();
+     // the same instance appearing twice forces cloning of every operand
+     Set<Integer> ids = new HashSet<Integer>();
+     for (Automaton a : l)
+       ids.add(System.identityHashCode(a));
+     boolean has_aliases = ids.size() != l.size();
+     Automaton b = l.get(0);
+     if (has_aliases) b = b.cloneExpanded();
+     else b = b.cloneExpandedIfRequired();
+     // ac: accept states of the part built so far
+     Set<State> ac = b.getAcceptStates();
+     boolean first = true;
+     for (Automaton a : l)
+       if (first) first = false;
+       else {
+         if (a.isEmptyString()) continue;
+         Automaton aa = a;
+         if (has_aliases) aa = aa.cloneExpanded();
+         else aa = aa.cloneExpandedIfRequired();
+         Set<State> ns = aa.getAcceptStates();
+         for (State s : ac) {
+           s.accept = false;
+           s.addEpsilon(aa.initial);
+           // s is kept as an accept state if it is accepting again after
+           // the epsilon was added
+           if (s.accept) ns.add(s);
+         }
+         ac = ns;
+       }
+     b.deterministic = false;
+     b.clearHashCode();
+     b.checkMinimizeAlways();
+     return b;
+   }
+ }
+
+ /**
+  * Returns an automaton accepting the empty string in addition to the
+  * language of the given automaton.
+  * <p>
+  * Complexity: linear in number of states.
+  */
+ static public Automaton optional(Automaton a) {
+   Automaton result = a.cloneExpandedIfRequired();
+   // fresh accepting start state that continues into the old start state
+   State start = new State();
+   start.addEpsilon(result.initial);
+   start.accept = true;
+   result.initial = start;
+   result.deterministic = false;
+   result.clearHashCode();
+   result.checkMinimizeAlways();
+   return result;
+ }
+
+ /**
+  * Returns an automaton accepting the Kleene star (zero or more concatenated
+  * repetitions) of the language of the given automaton. The input automaton
+  * is cloned first, so its language is never modified.
+  * <p>
+  * Complexity: linear in number of states.
+  */
+ static public Automaton repeat(Automaton a) {
+   Automaton result = a.cloneExpanded();
+   // fresh accepting start state that continues into the old start state
+   State start = new State();
+   start.accept = true;
+   start.addEpsilon(result.initial);
+   // every accept state loops back to the new start state
+   for (State accepting : result.getAcceptStates()) {
+     accepting.addEpsilon(start);
+   }
+   result.initial = start;
+   result.deterministic = false;
+   result.clearHashCode();
+   result.checkMinimizeAlways();
+   return result;
+ }
+
+ /**
+  * Returns an automaton that accepts <code>min</code> or more concatenated
+  * repetitions of the language of the given automaton.
+  * <p>
+  * Complexity: linear in number of states and in <code>min</code>.
+  *
+  * @param a automaton to repeat
+  * @param min minimum number of repetitions; 0 is the plain Kleene star
+  */
+ static public Automaton repeat(Automaton a, int min) {
+   if (min == 0) return repeat(a);
+   // min mandatory copies followed by a Kleene star of the same automaton
+   List<Automaton> as = new ArrayList<Automaton>();
+   while (min-- > 0)
+     as.add(a);
+   as.add(repeat(a));
+   return concatenate(as);
+ }
+
+ /**
+  * Returns an automaton that accepts between <code>min</code> and
+  * <code>max</code> (including both) concatenated repetitions of the language
+  * of the given automaton.
+  * <p>
+  * Complexity: linear in number of states and in <code>min</code> and
+  * <code>max</code>.
+  *
+  * @param a automaton to repeat; its singleton form, if any, is expanded
+  * @param min minimum number of repetitions
+  * @param max maximum number of repetitions; if smaller than
+  *          <code>min</code> the empty automaton is returned
+  */
+ static public Automaton repeat(Automaton a, int min, int max) {
+   if (min > max) return BasicAutomata.makeEmpty();
+   // from here on max is the number of optional extra repetitions
+   max -= min;
+   a.expandSingleton();
+   // b: the mandatory part (exactly min repetitions)
+   Automaton b;
+   if (min == 0) b = BasicAutomata.makeEmptyString();
+   else if (min == 1) b = a.clone();
+   else {
+     List<Automaton> as = new ArrayList<Automaton>();
+     while (min-- > 0)
+       as.add(a);
+     b = concatenate(as);
+   }
+   if (max > 0) {
+     // d: chain of max clones, each one linked by epsilon to the next
+     Automaton d = a.clone();
+     while (--max > 0) {
+       Automaton c = a.clone();
+       for (State p : c.getAcceptStates())
+         p.addEpsilon(d.initial);
+       d = c;
+     }
+     // attach the optional chain behind the mandatory part
+     for (State p : b.getAcceptStates())
+       p.addEpsilon(d.initial);
+     b.deterministic = false;
+     b.clearHashCode();
+     b.checkMinimizeAlways();
+   }
+   return b;
+ }
+
+ /**
+  * Returns a (deterministic) automaton accepting the complement of the
+  * language of the given automaton.
+  * <p>
+  * Complexity: linear in number of states (if already deterministic).
+  */
+ static public Automaton complement(Automaton a) {
+   Automaton result = a.cloneExpandedIfRequired();
+   result.determinize();
+   result.totalize();
+   // flip acceptance of every state
+   for (State state : result.getStates()) {
+     state.accept = !state.accept;
+   }
+   result.removeDeadTransitions();
+   return result;
+ }
+
+ /**
+  * Returns a (deterministic) automaton accepting the intersection of the
+  * language of <code>a1</code> and the complement of the language of
+  * <code>a2</code>. As a side-effect, the automata may be determinized, if
+  * not already deterministic.
+  * <p>
+  * Complexity: quadratic in number of states (if already deterministic).
+  */
+ static public Automaton minus(Automaton a1, Automaton a2) {
+   // nothing is left when a1 is empty or both arguments are the same object
+   if (isEmpty(a1) || a1 == a2) {
+     return BasicAutomata.makeEmpty();
+   }
+   // subtracting the empty language changes nothing
+   if (isEmpty(a2)) {
+     return a1.cloneIfRequired();
+   }
+   // a single string either survives whole or disappears
+   if (a1.isSingleton()) {
+     return run(a2, a1.singleton) ? BasicAutomata.makeEmpty() : a1
+         .cloneIfRequired();
+   }
+   return intersection(a1, a2.complement());
+ }
+
+ /**
+  * Returns an automaton that accepts the intersection of the languages of the
+  * given automata. Never modifies the input automata languages.
+  * <p>
+  * Complexity: quadratic in number of states.
+  */
+ static public Automaton intersection(Automaton a1, Automaton a2) {
+   // a single string is in the intersection iff the other automaton accepts it
+   if (a1.isSingleton()) {
+     if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired();
+     else return BasicAutomata.makeEmpty();
+   }
+   if (a2.isSingleton()) {
+     if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired();
+     else return BasicAutomata.makeEmpty();
+   }
+   if (a1 == a2) return a1.cloneIfRequired();
+   Transition[][] transitions1 = Automaton
+       .getSortedTransitions(a1.getStates());
+   Transition[][] transitions2 = Automaton
+       .getSortedTransitions(a2.getStates());
+   Automaton c = new Automaton();
+   // product construction: each StatePair (s1, s2) gets its own new state s
+   LinkedList<StatePair> worklist = new LinkedList<StatePair>();
+   HashMap<StatePair,StatePair> newstates = new HashMap<StatePair,StatePair>();
+   StatePair p = new StatePair(c.initial, a1.initial, a2.initial);
+   worklist.add(p);
+   newstates.put(p, p);
+   while (worklist.size() > 0) {
+     p = worklist.removeFirst();
+     p.s.accept = p.s1.accept && p.s2.accept;
+     Transition[] t1 = transitions1[p.s1.number];
+     Transition[] t2 = transitions2[p.s2.number];
+     for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+       // skip transitions of t2 that end before t1[n1] starts
+       while (b2 < t2.length && t2[b2].max < t1[n1].min)
+         b2++;
+       for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
+         if (t2[n2].max >= t1[n1].min) {
+           StatePair q = new StatePair(t1[n1].to, t2[n2].to);
+           StatePair r = newstates.get(q);
+           if (r == null) {
+             q.s = new State();
+             worklist.add(q);
+             newstates.put(q, q);
+             r = q;
+           }
+           // the new transition covers the overlap of the two ranges
+           char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
+           char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
+           p.s.transitions.add(new Transition(min, max, r.s));
+         }
+     }
+   }
+   c.deterministic = a1.deterministic && a2.deterministic;
+   c.removeDeadTransitions();
+   c.checkMinimizeAlways();
+   return c;
+ }
+
+ /**
+  * Returns true if the language of <code>a1</code> is a subset of the language
+  * of <code>a2</code>. As a side-effect, <code>a2</code> is determinized if
+  * not already marked as deterministic.
+  * <p>
+  * Complexity: quadratic in number of states.
+  */
+ public static boolean subsetOf(Automaton a1, Automaton a2) {
+   if (a1 == a2) return true;
+   if (a1.isSingleton()) {
+     if (a2.isSingleton()) return a1.singleton.equals(a2.singleton);
+     return BasicOperations.run(a2, a1.singleton);
+   }
+   a2.determinize();
+   Transition[][] transitions1 = Automaton
+       .getSortedTransitions(a1.getStates());
+   Transition[][] transitions2 = Automaton
+       .getSortedTransitions(a2.getStates());
+   // walk both automata in lock step over pairs of states
+   LinkedList<StatePair> worklist = new LinkedList<StatePair>();
+   HashSet<StatePair> visited = new HashSet<StatePair>();
+   StatePair p = new StatePair(a1.initial, a2.initial);
+   worklist.add(p);
+   visited.add(p);
+   while (worklist.size() > 0) {
+     p = worklist.removeFirst();
+     // a1 accepts here but a2 does not: not a subset
+     if (p.s1.accept && !p.s2.accept) return false;
+     Transition[] t1 = transitions1[p.s1.number];
+     Transition[] t2 = transitions2[p.s2.number];
+     for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+       while (b2 < t2.length && t2[b2].max < t1[n1].min)
+         b2++;
+       // [min1, max1] is the part of t1[n1]'s range not yet covered by t2
+       int min1 = t1[n1].min, max1 = t1[n1].max;
+       for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
+         // gap before the next t2 transition: a1 can move where a2 cannot
+         if (t2[n2].min > min1) return false;
+         if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1;
+         else {
+           // t2 reaches the top of the range: mark the remainder as empty
+           min1 = Character.MAX_VALUE;
+           max1 = Character.MIN_VALUE;
+         }
+         StatePair q = new StatePair(t1[n1].to, t2[n2].to);
+         if (!visited.contains(q)) {
+           worklist.add(q);
+           visited.add(q);
+         }
+       }
+       // something of t1[n1]'s range is left uncovered
+       if (min1 <= max1) return false;
+     }
+   }
+   return true;
+ }
+
+ /**
+  * Returns an automaton that accepts the union of the languages of the given
+  * automata.
+  * <p>
+  * Complexity: linear in number of states.
+  */
+ public static Automaton union(Automaton a1, Automaton a2) {
+   // identical single strings, or the very same object: the union is a1 itself
+   if ((a1.isSingleton() && a2.isSingleton() && a1.singleton
+       .equals(a2.singleton))
+       || a1 == a2) return a1.cloneIfRequired();
+   // a1 != a2 is guaranteed here, so no forced cloning of aliases is needed
+   a1 = a1.cloneExpandedIfRequired();
+   a2 = a2.cloneExpandedIfRequired();
+   // new start state that continues into both operands
+   State s = new State();
+   s.addEpsilon(a1.initial);
+   s.addEpsilon(a2.initial);
+   a1.initial = s;
+   a1.deterministic = false;
+   a1.clearHashCode();
+   a1.checkMinimizeAlways();
+   return a1;
+ }
+
+ /**
+  * Returns an automaton that accepts the union of the languages of the given
+  * automata.
+  * <p>
+  * Complexity: linear in number of states.
+  *
+  * @param l automata to unite; automata with an empty language are skipped
+  */
+ public static Automaton union(Collection<Automaton> l) {
+   // the same instance appearing twice forces cloning of every operand
+   Set<Integer> ids = new HashSet<Integer>();
+   for (Automaton a : l)
+     ids.add(System.identityHashCode(a));
+   boolean has_aliases = ids.size() != l.size();
+   // new start state that continues into every non-empty operand
+   State s = new State();
+   for (Automaton b : l) {
+     if (BasicOperations.isEmpty(b)) continue;
+     Automaton bb = b;
+     if (has_aliases) bb = bb.cloneExpanded();
+     else bb = bb.cloneExpandedIfRequired();
+     s.addEpsilon(bb.initial);
+   }
+   Automaton a = new Automaton();
+   a.initial = s;
+   a.deterministic = false;
+   a.clearHashCode();
+   a.checkMinimizeAlways();
+   return a;
+ }
+
+ /**
+  * Determinizes the given automaton. Does nothing if the automaton is already
+  * marked deterministic or is a singleton.
+  * <p>
+  * Complexity: exponential in number of states.
+  */
+ public static void determinize(Automaton a) {
+   if (a.deterministic || a.isSingleton()) return;
+   Set<State> initialset = new HashSet<State>();
+   initialset.add(a.initial);
+   determinize(a, initialset);
+ }
+
+ /**
+  * Determinizes the given automaton using the given set of initial states.
+  *
+  * @param a automaton to determinize in place; it receives a new initial state
+  * @param initialset states that together form the new initial state
+  */
+ static void determinize(Automaton a, Set<State> initialset) {
+   char[] points = a.getStartPoints();
+   // subset construction
+   Map<Set<State>,Set<State>> sets = new HashMap<Set<State>,Set<State>>();
+   LinkedList<Set<State>> worklist = new LinkedList<Set<State>>();
+   // maps each set of old states to the new state that represents it
+   Map<Set<State>,State> newstate = new HashMap<Set<State>,State>();
+   sets.put(initialset, initialset);
+   worklist.add(initialset);
+   a.initial = new State();
+   newstate.put(initialset, a.initial);
+   while (worklist.size() > 0) {
+     Set<State> s = worklist.removeFirst();
+     State r = newstate.get(s);
+     // the new state accepts if any member of the set accepts
+     for (State q : s)
+       if (q.accept) {
+         r.accept = true;
+         break;
+       }
+     for (int n = 0; n < points.length; n++) {
+       // p: all states reachable from s on character points[n]
+       Set<State> p = new HashSet<State>();
+       for (State q : s)
+         for (Transition t : q.transitions)
+           if (t.min <= points[n] && points[n] <= t.max) p.add(t.to);
+       if (!sets.containsKey(p)) {
+         sets.put(p, p);
+         worklist.add(p);
+         newstate.put(p, new State());
+       }
+       State q = newstate.get(p);
+       // the transition spans from this start point up to just before the next
+       char min = points[n];
+       char max;
+       if (n + 1 < points.length) max = (char) (points[n + 1] - 1);
+       else max = Character.MAX_VALUE;
+       r.transitions.add(new Transition(min, max, q));
+     }
+   }
+   a.deterministic = true;
+   a.removeDeadTransitions();
+ }
+
+ /**
+  * Adds epsilon transitions to the given automaton. This method adds extra
+  * character interval transitions that are equivalent to the given set of
+  * epsilon transitions.
+  *
+  * @param a automaton to modify in place
+  * @param pairs collection of {@link StatePair} objects representing pairs of
+  *          source/destination states where epsilon transitions should be
+  *          added; the collection is extended with the pairs found while
+  *          computing the closure
+  */
+ public static void addEpsilons(Automaton a, Collection<StatePair> pairs) {
+   a.expandSingleton();
+   // forward: source -> destinations, back: destination -> sources
+   HashMap<State,HashSet<State>> forward = new HashMap<State,HashSet<State>>();
+   HashMap<State,HashSet<State>> back = new HashMap<State,HashSet<State>>();
+   for (StatePair p : pairs) {
+     HashSet<State> to = forward.get(p.s1);
+     if (to == null) {
+       to = new HashSet<State>();
+       forward.put(p.s1, to);
+     }
+     to.add(p.s2);
+     HashSet<State> from = back.get(p.s2);
+     if (from == null) {
+       from = new HashSet<State>();
+       back.put(p.s2, from);
+     }
+     from.add(p.s1);
+   }
+   // calculate epsilon closure
+   LinkedList<StatePair> worklist = new LinkedList<StatePair>(pairs);
+   HashSet<StatePair> workset = new HashSet<StatePair>(pairs);
+   while (!worklist.isEmpty()) {
+     StatePair p = worklist.removeFirst();
+     workset.remove(p);
+     HashSet<State> to = forward.get(p.s2);
+     HashSet<State> from = back.get(p.s1);
+     if (to != null) {
+       for (State s : to) {
+         // s1 -> s2 and s2 -> s give the transitive pair s1 -> s
+         StatePair pp = new StatePair(p.s1, s);
+         if (!pairs.contains(pp)) {
+           pairs.add(pp);
+           forward.get(p.s1).add(s);
+           back.get(s).add(p.s1);
+           worklist.add(pp);
+           workset.add(pp);
+           if (from != null) {
+             // pairs leading into s1 must be looked at again
+             for (State q : from) {
+               StatePair qq = new StatePair(q, p.s1);
+               if (!workset.contains(qq)) {
+                 worklist.add(qq);
+                 workset.add(qq);
+               }
+             }
+           }
+         }
+       }
+     }
+   }
+   // add transitions
+   for (StatePair p : pairs)
+     p.s1.addEpsilon(p.s2);
+   a.deterministic = false;
+   a.clearHashCode();
+   a.checkMinimizeAlways();
+ }
+
+ /**
+  * Tells whether the language of the given automaton consists of the empty
+  * string and nothing else.
+  */
+ public static boolean isEmptyString(Automaton a) {
+   return a.isSingleton()
+       ? a.singleton.length() == 0
+       : a.initial.accept && a.initial.transitions.isEmpty();
+ }
+
+ /**
+  * Tells whether the given automaton accepts no strings at all. A singleton
+  * automaton is never reported as empty.
+  */
+ public static boolean isEmpty(Automaton a) {
+   return !a.isSingleton() && !a.initial.accept
+       && a.initial.transitions.isEmpty();
+ }
+
+ /**
+  * Tells whether the given automaton accepts all strings: its initial state
+  * must accept and have a single transition, a self-loop over the whole
+  * character range.
+  */
+ public static boolean isTotal(Automaton a) {
+   if (a.isSingleton()) return false;
+   if (!a.initial.accept || a.initial.transitions.size() != 1) return false;
+   Transition only = a.initial.transitions.iterator().next();
+   boolean selfLoop = only.to == a.initial;
+   return selfLoop && only.min == Character.MIN_VALUE
+       && only.max == Character.MAX_VALUE;
+ }
+
+ /**
+  * Returns true if the given string is accepted by the automaton.
+  * <p>
+  * Complexity: linear in the length of the string.
+  * <p>
+  * <b>Note:</b> for full performance, use the {@link RunAutomaton} class.
+  *
+  * @param a automaton to run
+  * @param s input string
+  */
+ public static boolean run(Automaton a, String s) {
+   if (a.isSingleton()) return s.equals(a.singleton);
+   if (a.deterministic) {
+     // follow the single possible path, failing on the first missing step
+     State p = a.initial;
+     for (int i = 0; i < s.length(); i++) {
+       State q = p.step(s.charAt(i));
+       if (q == null) return false;
+       p = q;
+     }
+     return p.accept;
+   } else {
+     Set<State> states = a.getStates();
+     Automaton.setStateNumbers(states);
+     // pp: states reached so far; pp_other: states reached after the next char
+     LinkedList<State> pp = new LinkedList<State>();
+     LinkedList<State> pp_other = new LinkedList<State>();
+     // bit sets indexed by state number keep each list free of duplicates
+     BitSet bb = new BitSet(states.size());
+     BitSet bb_other = new BitSet(states.size());
+     pp.add(a.initial);
+     ArrayList<State> dest = new ArrayList<State>();
+     boolean accept = a.initial.accept;
+     for (int i = 0; i < s.length(); i++) {
+       char c = s.charAt(i);
+       accept = false;
+       pp_other.clear();
+       bb_other.clear();
+       for (State p : pp) {
+         dest.clear();
+         p.step(c, dest);
+         for (State q : dest) {
+           if (q.accept) accept = true;
+           if (!bb_other.get(q.number)) {
+             bb_other.set(q.number);
+             pp_other.add(q);
+           }
+         }
+       }
+       // swap current and next
+       LinkedList<State> tp = pp;
+       pp = pp_other;
+       pp_other = tp;
+       BitSet tb = bb;
+       bb = bb_other;
+       bb_other = tb;
+     }
+     return accept;
+   }
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\BasicOperations.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 0)
@@ -0,0 +1,278 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.Set;
+
+/**
+ * Operations for minimizing automata.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class MinimizationOperations {
+
+ // Not instantiable: every operation in this class is a static method.
+ private MinimizationOperations() {}
+
+ /**
+  * Minimizes the given automaton, determinizing it first if it is not
+  * already deterministic. Singleton automata are left as they are. The
+  * automaton's hash code is recomputed in every case.
+  *
+  * @see Automaton#setMinimization(int)
+  */
+ public static void minimize(Automaton a) {
+   boolean needsWork = !a.isSingleton();
+   if (needsWork) minimizeHopcroft(a);
+   // recompute hash code; 0 is never stored
+   int states = a.getNumberOfStates();
+   int transitions = a.getNumberOfTransitions();
+   int hash = states * 3 + transitions * 2;
+   a.hash_code = (hash == 0) ? 1 : hash;
+ }
+
+ // Appends 'size' null entries to the list so that positions 0..size-1 can
+ // afterwards be filled with set().
+ private static <T> void initialize(ArrayList<T> list, int size) {
+   for (int i = 0; i < size; i++)
+     list.add(null);
+ }
+
+ /**
+  * Minimizes the given automaton using Hopcroft's algorithm.
+  *
+  * @param a automaton to minimize in place; it is determinized and totalized
+  *          before the partition refinement starts
+  */
+ public static void minimizeHopcroft(Automaton a) {
+   a.determinize();
+   Set<Transition> tr = a.initial.getTransitions();
+   // return early when the initial state's only transition is a self-loop
+   // over the whole character range
+   if (tr.size() == 1) {
+     Transition t = tr.iterator().next();
+     if (t.to == a.initial && t.min == Character.MIN_VALUE
+         && t.max == Character.MAX_VALUE) return;
+   }
+   a.totalize();
+   // make arrays for numbered states and effective alphabet
+   Set<State> ss = a.getStates();
+   State[] states = new State[ss.size()];
+   int number = 0;
+   for (State q : ss) {
+     states[number] = q;
+     q.number = number++;
+   }
+   char[] sigma = a.getStartPoints();
+   // initialize data structures
+   // reverse[q][x]: states that move to state q on character sigma[x]
+   ArrayList<ArrayList<LinkedList<State>>> reverse = new ArrayList<ArrayList<LinkedList<State>>>();
+   for (int q = 0; q < states.length; q++) {
+     ArrayList<LinkedList<State>> v = new ArrayList<LinkedList<State>>();
+     initialize(v, sigma.length);
+     reverse.add(v);
+   }
+   boolean[][] reverse_nonempty = new boolean[states.length][sigma.length];
+   // partition[j]: states of block j; block[n]: block of state number n
+   ArrayList<LinkedList<State>> partition = new ArrayList<LinkedList<State>>();
+   initialize(partition, states.length);
+   int[] block = new int[states.length];
+   StateList[][] active = new StateList[states.length][sigma.length];
+   StateListNode[][] active2 = new StateListNode[states.length][sigma.length];
+   // pending: (block, character) pairs still to process; pending2 marks membership
+   LinkedList<IntPair> pending = new LinkedList<IntPair>();
+   boolean[][] pending2 = new boolean[sigma.length][states.length];
+   ArrayList<State> split = new ArrayList<State>();
+   boolean[] split2 = new boolean[states.length];
+   ArrayList<Integer> refine = new ArrayList<Integer>();
+   boolean[] refine2 = new boolean[states.length];
+   ArrayList<ArrayList<State>> splitblock = new ArrayList<ArrayList<State>>();
+   initialize(splitblock, states.length);
+   for (int q = 0; q < states.length; q++) {
+     splitblock.set(q, new ArrayList<State>());
+     partition.set(q, new LinkedList<State>());
+     for (int x = 0; x < sigma.length; x++) {
+       reverse.get(q).set(x, new LinkedList<State>());
+       active[q][x] = new StateList();
+     }
+   }
+   // find initial partition and reverse edges
+   for (int q = 0; q < states.length; q++) {
+     State qq = states[q];
+     // block 0 holds the accept states, block 1 the others
+     int j;
+     if (qq.accept) j = 0;
+     else j = 1;
+     partition.get(j).add(qq);
+     block[qq.number] = j;
+     for (int x = 0; x < sigma.length; x++) {
+       char y = sigma[x];
+       State p = qq.step(y);
+       reverse.get(p.number).get(x).add(qq);
+       reverse_nonempty[p.number][x] = true;
+     }
+   }
+   // initialize active sets
+   for (int j = 0; j <= 1; j++)
+     for (int x = 0; x < sigma.length; x++)
+       for (State qq : partition.get(j))
+         if (reverse_nonempty[qq.number][x]) active2[qq.number][x] = active[j][x]
+             .add(qq);
+   // initialize pending
+   for (int x = 0; x < sigma.length; x++) {
+     int a0 = active[0][x].size;
+     int a1 = active[1][x].size;
+     // start with the block whose active list is not larger
+     int j;
+     if (a0 <= a1) j = 0;
+     else j = 1;
+     pending.add(new IntPair(j, x));
+     pending2[x][j] = true;
+   }
+   // process pending until fixed point
+   // k is the number of blocks in use (index of the next new block)
+   int k = 2;
+   while (!pending.isEmpty()) {
+     IntPair ip = pending.removeFirst();
+     int p = ip.n1;
+     int x = ip.n2;
+     pending2[x][p] = false;
+     // find states that need to be split off their blocks
+     for (StateListNode m = active[p][x].first; m != null; m = m.next)
+       for (State s : reverse.get(m.q.number).get(x))
+         if (!split2[s.number]) {
+           split2[s.number] = true;
+           split.add(s);
+           int j = block[s.number];
+           splitblock.get(j).add(s);
+           if (!refine2[j]) {
+             refine2[j] = true;
+             refine.add(j);
+           }
+         }
+     // refine blocks
+     for (int j : refine) {
+       // only split when some, but not all, states of block j were marked
+       if (splitblock.get(j).size() < partition.get(j).size()) {
+         LinkedList<State> b1 = partition.get(j);
+         LinkedList<State> b2 = partition.get(k);
+         for (State s : splitblock.get(j)) {
+           b1.remove(s);
+           b2.add(s);
+           block[s.number] = k;
+           for (int c = 0; c < sigma.length; c++) {
+             StateListNode sn = active2[s.number][c];
+             if (sn != null && sn.sl == active[j][c]) {
+               sn.remove();
+               active2[s.number][c] = active[k][c].add(s);
+             }
+           }
+         }
+         // update pending
+         for (int c = 0; c < sigma.length; c++) {
+           int aj = active[j][c].size;
+           int ak = active[k][c].size;
+           if (!pending2[c][j] && 0 < aj && aj <= ak) {
+             pending2[c][j] = true;
+             pending.add(new IntPair(j, c));
+           } else {
+             pending2[c][k] = true;
+             pending.add(new IntPair(k, c));
+           }
+         }
+         k++;
+       }
+       // reset the markers of block j for the next round
+       for (State s : splitblock.get(j))
+         split2[s.number] = false;
+       refine2[j] = false;
+       splitblock.get(j).clear();
+     }
+     split.clear();
+     refine.clear();
+   }
+   // make a new state for each equivalence class, set initial state
+   State[] newstates = new State[k];
+   for (int n = 0; n < newstates.length; n++) {
+     State s = new State();
+     newstates[n] = s;
+     for (State q : partition.get(n)) {
+       if (q == a.initial) a.initial = s;
+       s.accept = q.accept;
+       s.number = q.number; // select representative
+       q.number = n;
+     }
+   }
+   // build transitions and set acceptance
+   for (int n = 0; n < newstates.length; n++) {
+     State s = newstates[n];
+     s.accept = states[s.number].accept;
+     for (Transition t : states[s.number].transitions)
+       s.transitions.add(new Transition(t.min, t.max, newstates[t.to.number]));
+   }
+   a.removeDeadTransitions();
+ }
+
+ /** Plain holder for two int values. */
+ static class IntPair {
+
+   /** First value. */
+   int n1;
+
+   /** Second value. */
+   int n2;
+
+   IntPair(int first, int second) {
+     n1 = first;
+     n2 = second;
+   }
+ }
+
+ /** Header of a doubly linked list of {@link StateListNode} entries. */
+ static class StateList {
+
+   /** Number of nodes currently linked into this list. */
+   int size;
+
+   /** First node, or null when the list is empty. */
+   StateListNode first;
+
+   /** Last node, or null when the list is empty. */
+   StateListNode last;
+
+   /** Creates a node for q; the node links itself to the end of this list. */
+   StateListNode add(State q) {
+     StateListNode node = new StateListNode(q, this);
+     return node;
+   }
+ }
+
+ /** Node of a {@link StateList}; it appends itself to the list on creation. */
+ static class StateListNode {
+
+   /** State stored in this node. */
+   State q;
+
+   /** Neighbours in the list, null at the respective end. */
+   StateListNode next, prev;
+
+   /** List this node belongs to. */
+   StateList sl;
+
+   StateListNode(State q, StateList sl) {
+     this.q = q;
+     this.sl = sl;
+     boolean wasEmpty = sl.size == 0;
+     sl.size++;
+     if (wasEmpty) {
+       sl.first = this;
+       sl.last = this;
+     } else {
+       // append behind the current last node
+       sl.last.next = this;
+       prev = sl.last;
+       sl.last = this;
+     }
+   }
+
+   /** Unlinks this node from its list and decrements the list size. */
+   void remove() {
+     boolean isHead = sl.first == this;
+     boolean isTail = sl.last == this;
+     sl.size--;
+     if (isHead) sl.first = next;
+     else prev.next = next;
+     if (isTail) sl.last = prev;
+     else next.prev = prev;
+   }
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\MinimizationOperations.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/package.html
===================================================================
--- src/java/org/apache/lucene/util/automaton/package.html (revision 0)
+++ src/java/org/apache/lucene/util/automaton/package.html (revision 0)
@@ -0,0 +1,50 @@
+
+
+
+
+Finite-state automaton for regular expressions.
+
+This package contains a full DFA/NFA implementation with Unicode
+alphabet and support for all standard (and a number of non-standard)
+regular expression operations.
+
+The most commonly used functionality is located in the classes
+{@link org.apache.lucene.util.automaton.Automaton} and
+{@link org.apache.lucene.util.automaton.RegExp}.
+
+For more information, go to the package home page at
+http://www.brics.dk/automaton/.
+
+WARNING: The status of the Automaton feature is experimental.
+The APIs introduced here might change in the future and will not be
+supported anymore in such a case.
+
+
Property changes on: src\java\org\apache\lucene\util\automaton\package.html
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/RegExp.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0)
@@ -0,0 +1,1003 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Regular Expression extension to Automaton.
+ *
+ * Regular expressions are built from the following abstract syntax:
+ *
+ *
+ *
+ * | regexp |
+ * ::= |
+ * unionexp |
+ * |
+ * |
+ *
+ *
+ * |
+ * | |
+ * |
+ * |
+ * |
+ *
+ *
+ *
+ * | unionexp |
+ * ::= |
+ * interexp | unionexp |
+ * (union) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * interexp |
+ * |
+ * |
+ *
+ *
+ *
+ * | interexp |
+ * ::= |
+ * concatexp & interexp |
+ * (intersection) |
+ * [OPTIONAL] |
+ *
+ *
+ * |
+ * | |
+ * concatexp |
+ * |
+ * |
+ *
+ *
+ *
+ * | concatexp |
+ * ::= |
+ * repeatexp concatexp |
+ * (concatenation) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * repeatexp |
+ * |
+ * |
+ *
+ *
+ *
+ * | repeatexp |
+ * ::= |
+ * repeatexp ? |
+ * (zero or one occurrence) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * repeatexp * |
+ * (zero or more occurrences) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * repeatexp + |
+ * (one or more occurrences) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * repeatexp {n} |
+ * (n occurrences) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * repeatexp {n,} |
+ * (n or more occurrences) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * repeatexp {n,m} |
+ * (n to m occurrences, including both) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * complexp |
+ * |
+ * |
+ *
+ *
+ *
+ * | complexp |
+ * ::= |
+ * ~ complexp |
+ * (complement) |
+ * [OPTIONAL] |
+ *
+ *
+ * |
+ * | |
+ * charclassexp |
+ * |
+ * |
+ *
+ *
+ *
+ * | charclassexp |
+ * ::= |
+ * [ charclasses ] |
+ * (character class) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * [^ charclasses ] |
+ * (negated character class) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * simpleexp |
+ * |
+ * |
+ *
+ *
+ *
+ * | charclasses |
+ * ::= |
+ * charclass charclasses |
+ * |
+ * |
+ *
+ *
+ * |
+ * | |
+ * charclass |
+ * |
+ * |
+ *
+ *
+ *
+ * | charclass |
+ * ::= |
+ * charexp - charexp |
+ * (character range, including end-points) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * charexp |
+ * |
+ * |
+ *
+ *
+ *
+ * | simpleexp |
+ * ::= |
+ * charexp |
+ * |
+ * |
+ *
+ *
+ * |
+ * | |
+ * . |
+ * (any single character) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * # |
+ * (the empty language) |
+ * [OPTIONAL] |
+ *
+ *
+ * |
+ * | |
+ * @ |
+ * (any string) |
+ * [OPTIONAL] |
+ *
+ *
+ * |
+ * | |
+ * " <Unicode string without double-quotes> " |
+ * (a string) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * ( ) |
+ * (the empty string) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * ( unionexp ) |
+ * (precedence override) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * < <identifier> > |
+ * (named automaton) |
+ * [OPTIONAL] |
+ *
+ *
+ * |
+ * | |
+ * <n-m> |
+ * (numerical interval) |
+ * [OPTIONAL] |
+ *
+ *
+ *
+ * | charexp |
+ * ::= |
+ * <Unicode character> |
+ * (a single non-reserved character) |
+ * |
+ *
+ *
+ * |
+ * | |
+ * \ <Unicode character> |
+ * (a single character) |
+ * |
+ *
+ *
+ *
+ * The productions marked [OPTIONAL] are only allowed if
+ * specified by the syntax flags passed to the RegExp constructor.
+ * The reserved characters used in the (enabled) syntax must be escaped with
+ * backslash (\) or double-quotes ("..."). (In
+ * contrast to other regexp syntaxes, this is required also in character
+ * classes.) Be aware that dash (-) has a special meaning in
+ * charclass expressions. An identifier is a string not containing right
+ * angle bracket (>) or dash (-). Numerical
+ * intervals are specified by non-negative decimal integers and include both end
+ * points, and if n and m have the same number
+ * of digits, then the conforming strings must have that length (i.e. prefixed
+ * by 0's).
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class RegExp {
+
+ enum Kind {
+ REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
+ }
+
+ /**
+ * Syntax flag, enables intersection (&).
+ */
+ public static final int INTERSECTION = 0x0001;
+
+ /**
+ * Syntax flag, enables complement (~).
+ */
+ public static final int COMPLEMENT = 0x0002;
+
+ /**
+ * Syntax flag, enables empty language (#).
+ */
+ public static final int EMPTY = 0x0004;
+
+ /**
+ * Syntax flag, enables anystring (@).
+ */
+ public static final int ANYSTRING = 0x0008;
+
+ /**
+ * Syntax flag, enables named automata (<identifier>).
+ */
+ public static final int AUTOMATON = 0x0010;
+
+ /**
+ * Syntax flag, enables numerical intervals (
+ * <n-m>).
+ */
+ public static final int INTERVAL = 0x0020;
+
+ /**
+ * Syntax flag, enables all optional regexp syntax.
+ */
+ public static final int ALL = 0xffff;
+
+ /**
+ * Syntax flag, enables no optional regexp syntax.
+ */
+ public static final int NONE = 0x0000;
+
+ private static boolean allow_mutation = false;
+
+ Kind kind;
+ RegExp exp1, exp2;
+ String s;
+ char c;
+ int min, max, digits;
+ char from, to;
+
+ String b;
+ int flags;
+ int pos;
+
+ RegExp() {}
+
+ /**
+ * Constructs new RegExp from a string. Same as
+ * RegExp(s, ALL).
+ *
+ * @param s regexp string
+ * @exception IllegalArgumentException if an error occurred while parsing the
+ * regular expression
+ */
+ public RegExp(String s) throws IllegalArgumentException {
+ this(s, ALL);
+ }
+
+ /**
+ * Constructs new RegExp from a string.
+ *
+ * @param s regexp string
+ * @param syntax_flags boolean 'or' of optional syntax constructs to be
+ * enabled
+ * @exception IllegalArgumentException if an error occurred while parsing the
+ * regular expression
+ */
+ public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
+ b = s;
+ flags = syntax_flags;
+ RegExp e;
+ if (s.length() == 0) e = makeString("");
+ else {
+ e = parseUnionExp();
+ if (pos < b.length()) throw new IllegalArgumentException(
+ "end-of-string expected at position " + pos);
+ }
+ kind = e.kind;
+ exp1 = e.exp1;
+ exp2 = e.exp2;
+ this.s = e.s;
+ c = e.c;
+ min = e.min;
+ max = e.max;
+ digits = e.digits;
+ from = e.from;
+ to = e.to;
+ b = null;
+ }
+
+ /**
+ * Constructs new Automaton from this RegExp. Same
+ * as toAutomaton(null) (empty automaton map).
+ */
+ public Automaton toAutomaton() {
+ return toAutomatonAllowMutate(null, null);
+ }
+
+ /**
+ * Constructs new Automaton from this RegExp. The
+ * constructed automaton is minimal and deterministic and has no transitions
+ * to dead states.
+ *
+ * @param automaton_provider provider of automata for named identifiers
+ * @exception IllegalArgumentException if this regular expression uses a named
+ * identifier that is not available from the automaton provider
+ */
+ public Automaton toAutomaton(AutomatonProvider automaton_provider)
+ throws IllegalArgumentException {
+ return toAutomatonAllowMutate(null, automaton_provider);
+ }
+
+ /**
+ * Constructs new Automaton from this RegExp. The
+ * constructed automaton is minimal and deterministic and has no transitions
+ * to dead states.
+ *
+ * @param automata a map from automaton identifiers to automata (of type
+ * Automaton).
+ * @exception IllegalArgumentException if this regular expression uses a named
+ * identifier that does not occur in the automaton map
+ */
+ public Automaton toAutomaton(Map automata)
+ throws IllegalArgumentException {
+ return toAutomatonAllowMutate(automata, null);
+ }
+
+ /**
+ * Sets or resets allow mutate flag. If this flag is set, then automata
+ * construction uses mutable automata, which is slightly faster but not thread
+ * safe. By default, the flag is not set. The flag is stored in a static field,
+ * so setting it through one RegExp instance affects all RegExp instances.
+ * @param flag if true, the flag is set
+ * @return previous value of the flag
+ */
+ public boolean setAllowMutate(boolean flag) {
+ boolean b = allow_mutation;
+ allow_mutation = flag;
+ return b;
+ }
+
+ private Automaton toAutomatonAllowMutate(Map automata,
+ AutomatonProvider automaton_provider) throws IllegalArgumentException {
+ if (!allow_mutation) return toAutomaton(automata, automaton_provider); // flag off: plain construction
+ boolean previous = Automaton.setAllowMutate(true); // thread unsafe: changes a setting shared via Automaton
+ try { // construct with mutation enabled; the prior Automaton setting is put back below
+ return toAutomaton(automata, automaton_provider);
+ } finally { Automaton.setAllowMutate(previous); } // restore even when construction throws
+ }
+
+ private Automaton toAutomaton(Map automata,
+ AutomatonProvider automaton_provider) throws IllegalArgumentException {
+ List list;
+ Automaton a = null;
+ switch (kind) {
+ case REGEXP_UNION:
+ list = new ArrayList();
+ findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
+ findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
+ a = BasicOperations.union(list);
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_CONCATENATION:
+ list = new ArrayList();
+ findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata,
+ automaton_provider);
+ findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
+ automaton_provider);
+ a = BasicOperations.concatenate(list);
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_INTERSECTION:
+ a = exp1.toAutomaton(automata, automaton_provider).intersection(
+ exp2.toAutomaton(automata, automaton_provider));
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_OPTIONAL:
+ a = exp1.toAutomaton(automata, automaton_provider).optional();
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_REPEAT:
+ a = exp1.toAutomaton(automata, automaton_provider).repeat();
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_REPEAT_MIN:
+ a = exp1.toAutomaton(automata, automaton_provider).repeat(min);
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_REPEAT_MINMAX:
+ a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max);
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_COMPLEMENT:
+ a = exp1.toAutomaton(automata, automaton_provider).complement();
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_CHAR:
+ a = BasicAutomata.makeChar(c);
+ break;
+ case REGEXP_CHAR_RANGE:
+ a = BasicAutomata.makeCharRange(from, to);
+ break;
+ case REGEXP_ANYCHAR:
+ a = BasicAutomata.makeAnyChar();
+ break;
+ case REGEXP_EMPTY:
+ a = BasicAutomata.makeEmpty();
+ break;
+ case REGEXP_STRING:
+ a = BasicAutomata.makeString(s);
+ break;
+ case REGEXP_ANYSTRING:
+ a = BasicAutomata.makeAnyString();
+ break;
+ case REGEXP_AUTOMATON:
+ Automaton aa = null;
+ if (automata != null) aa = automata.get(s);
+ if (aa == null && automaton_provider != null) try {
+ aa = automaton_provider.getAutomaton(s);
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ if (aa == null) throw new IllegalArgumentException("'" + s
+ + "' not found");
+ a = aa.clone(); // always clone here (ignore allow_mutate)
+ break;
+ case REGEXP_INTERVAL:
+ a = BasicAutomata.makeInterval(min, max, digits);
+ break;
+ }
+ return a;
+ }
+
+ private void findLeaves(RegExp exp, Kind kind, List list,
+ Map automata, AutomatonProvider automaton_provider) {
+ if (exp.kind == kind) { // same operator kind as requested: flatten by descending into both operands
+ findLeaves(exp.exp1, kind, list, automata, automaton_provider);
+ findLeaves(exp.exp2, kind, list, automata, automaton_provider);
+ } else list.add(exp.toAutomaton(automata, automaton_provider)); // different kind: convert the subtree and collect its automaton
+ }
+
+ /**
+ * Constructs string from parsed regular expression.
+ */
+ @Override
+ public String toString() {
+ return toStringBuilder(new StringBuilder()).toString();
+ }
+
+ StringBuilder toStringBuilder(StringBuilder b) {
+ switch (kind) {
+ case REGEXP_UNION:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append("|");
+ exp2.toStringBuilder(b);
+ b.append(")");
+ break;
+ case REGEXP_CONCATENATION:
+ exp1.toStringBuilder(b);
+ exp2.toStringBuilder(b);
+ break;
+ case REGEXP_INTERSECTION:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append("&");
+ exp2.toStringBuilder(b);
+ b.append(")");
+ break;
+ case REGEXP_OPTIONAL:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append(")?");
+ break;
+ case REGEXP_REPEAT:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append(")*");
+ break;
+ case REGEXP_REPEAT_MIN:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append("){").append(min).append(",}");
+ break;
+ case REGEXP_REPEAT_MINMAX:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append("){").append(min).append(",").append(max).append("}");
+ break;
+ case REGEXP_COMPLEMENT:
+ b.append("~(");
+ exp1.toStringBuilder(b);
+ b.append(")");
+ break;
+ case REGEXP_CHAR:
+ b.append("\\").append(c);
+ break;
+ case REGEXP_CHAR_RANGE:
+ b.append("[\\").append(from).append("-\\").append(to).append("]");
+ break;
+ case REGEXP_ANYCHAR:
+ b.append(".");
+ break;
+ case REGEXP_EMPTY:
+ b.append("#");
+ break;
+ case REGEXP_STRING:
+ b.append("\"").append(s).append("\"");
+ break;
+ case REGEXP_ANYSTRING:
+ b.append("@");
+ break;
+ case REGEXP_AUTOMATON:
+ b.append("<").append(s).append(">");
+ break;
+ case REGEXP_INTERVAL:
+ String s1 = Integer.toString(min);
+ String s2 = Integer.toString(max);
+ b.append("<");
+ if (digits > 0) for (int i = s1.length(); i < digits; i++)
+ b.append('0');
+ b.append(s1).append("-");
+ if (digits > 0) for (int i = s2.length(); i < digits; i++)
+ b.append('0');
+ b.append(s2).append(">");
+ break;
+ }
+ return b;
+ }
+
+ /**
+ * Returns set of automaton identifiers that occur in this regular expression.
+ */
+ public Set getIdentifiers() {
+ HashSet set = new HashSet();
+ getIdentifiers(set);
+ return set;
+ }
+
+ void getIdentifiers(Set set) {
+ switch (kind) {
+ case REGEXP_UNION:
+ case REGEXP_CONCATENATION:
+ case REGEXP_INTERSECTION:
+ exp1.getIdentifiers(set);
+ exp2.getIdentifiers(set);
+ break;
+ case REGEXP_OPTIONAL:
+ case REGEXP_REPEAT:
+ case REGEXP_REPEAT_MIN:
+ case REGEXP_REPEAT_MINMAX:
+ case REGEXP_COMPLEMENT:
+ exp1.getIdentifiers(set);
+ break;
+ case REGEXP_AUTOMATON:
+ set.add(s);
+ break;
+ default:
+ }
+ }
+
+ static RegExp makeUnion(RegExp exp1, RegExp exp2) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_UNION;
+ r.exp1 = exp1;
+ r.exp2 = exp2;
+ return r;
+ }
+
+ static RegExp makeConcatenation(RegExp exp1, RegExp exp2) { // builds a concatenation node, merging adjacent char/string literals
+ if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
+ && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
+ exp1, exp2); // both operands are literals: fold them into a single string node
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_CONCATENATION;
+ if (exp1.kind == Kind.REGEXP_CONCATENATION
+ && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
+ && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
+ r.exp1 = exp1.exp1; // left is (x . literal) and right is a literal: keep x ...
+ r.exp2 = makeString(exp1.exp2, exp2); // ... and merge the two literals on the right
+ } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
+ && exp2.kind == Kind.REGEXP_CONCATENATION
+ && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
+ r.exp1 = makeString(exp1, exp2.exp1); // left is a literal and right is (literal . x): merge the literals on the left
+ r.exp2 = exp2.exp2;
+ } else { // nothing to merge: plain binary concatenation
+ r.exp1 = exp1;
+ r.exp2 = exp2;
+ }
+ return r;
+ }
+
+ static private RegExp makeString(RegExp exp1, RegExp exp2) { // joins the text of two literal nodes into one REGEXP_STRING node
+ StringBuilder b = new StringBuilder();
+ if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
+ else b.append(exp1.c); // any non-string kind contributes its char field c
+ if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
+ else b.append(exp2.c);
+ return makeString(b.toString());
+ }
+
+ static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_INTERSECTION;
+ r.exp1 = exp1;
+ r.exp2 = exp2;
+ return r;
+ }
+
+ static RegExp makeOptional(RegExp exp) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_OPTIONAL;
+ r.exp1 = exp;
+ return r;
+ }
+
+ static RegExp makeRepeat(RegExp exp) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_REPEAT;
+ r.exp1 = exp;
+ return r;
+ }
+
+ static RegExp makeRepeat(RegExp exp, int min) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_REPEAT_MIN;
+ r.exp1 = exp;
+ r.min = min;
+ return r;
+ }
+
+ static RegExp makeRepeat(RegExp exp, int min, int max) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_REPEAT_MINMAX;
+ r.exp1 = exp;
+ r.min = min;
+ r.max = max;
+ return r;
+ }
+
+ static RegExp makeComplement(RegExp exp) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_COMPLEMENT;
+ r.exp1 = exp;
+ return r;
+ }
+
+ static RegExp makeChar(char c) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_CHAR;
+ r.c = c;
+ return r;
+ }
+
+ static RegExp makeCharRange(char from, char to) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_CHAR_RANGE;
+ r.from = from;
+ r.to = to;
+ return r;
+ }
+
+ static RegExp makeAnyChar() {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_ANYCHAR;
+ return r;
+ }
+
+ static RegExp makeEmpty() {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_EMPTY;
+ return r;
+ }
+
+ static RegExp makeString(String s) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_STRING;
+ r.s = s;
+ return r;
+ }
+
+ static RegExp makeAnyString() {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_ANYSTRING;
+ return r;
+ }
+
+ static RegExp makeAutomaton(String s) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_AUTOMATON;
+ r.s = s;
+ return r;
+ }
+
+ static RegExp makeInterval(int min, int max, int digits) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_INTERVAL;
+ r.min = min;
+ r.max = max;
+ r.digits = digits;
+ return r;
+ }
+
+ private boolean peek(String s) { // true if input remains and the next char is one of the chars in s; consumes nothing
+ return more() && s.indexOf(b.charAt(pos)) != -1;
+ }
+
+ private boolean match(char c) {
+ // Consume the next input char only when there is one and it equals c.
+ boolean matched = more() && b.charAt(pos) == c;
+ if (matched) {
+ pos++;
+ }
+ return matched;
+ }
+
+ private boolean more() {
+ return pos < b.length();
+ }
+
+ private char next() throws IllegalArgumentException { // returns the char at pos and advances pos by one
+ if (!more()) throw new IllegalArgumentException("unexpected end-of-string");
+ return b.charAt(pos++);
+ }
+
+ private boolean check(int flag) { // true if any bit of flag is set in this parser's syntax flags
+ return (flags & flag) != 0;
+ }
+
+ final RegExp parseUnionExp() throws IllegalArgumentException {
+ RegExp e = parseInterExp();
+ if (match('|')) e = makeUnion(e, parseUnionExp());
+ return e;
+ }
+
+ final RegExp parseInterExp() throws IllegalArgumentException {
+ RegExp e = parseConcatExp();
+ if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
+ parseInterExp());
+ return e;
+ }
+
+ final RegExp parseConcatExp() throws IllegalArgumentException {
+ RegExp e = parseRepeatExp();
+ if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
+ e, parseConcatExp());
+ return e;
+ }
+
+ final RegExp parseRepeatExp() throws IllegalArgumentException { // parses a complexp followed by any number of postfix repeat operators
+ RegExp e = parseComplExp();
+ while (peek("?*+{")) { // each operator wraps the expression built so far
+ if (match('?')) e = makeOptional(e);
+ else if (match('*')) e = makeRepeat(e);
+ else if (match('+')) e = makeRepeat(e, 1); // '+' is "repeat with minimum 1"
+ else if (match('{')) {
+ int start = pos;
+ while (peek("0123456789")) // scan the digits of n
+ next();
+ if (start == pos) throw new IllegalArgumentException(
+ "integer expected at position " + pos);
+ int n = Integer.parseInt(b.substring(start, pos));
+ int m = -1; // stays -1 when no upper bound is written, i.e. "{n,}"
+ if (match(',')) {
+ start = pos;
+ while (peek("0123456789")) // scan the optional digits of m
+ next();
+ if (start != pos) m = Integer.parseInt(b.substring(start, pos));
+ } else m = n; // "{n}": upper bound equals lower bound
+ if (!match('}')) throw new IllegalArgumentException(
+ "expected '}' at position " + pos);
+ if (m == -1) e = makeRepeat(e, n);
+ else e = makeRepeat(e, n, m);
+ }
+ }
+ return e;
+ }
+
+ final RegExp parseComplExp() throws IllegalArgumentException {
+ if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
+ else return parseCharClassExp();
+ }
+
+ final RegExp parseCharClassExp() throws IllegalArgumentException { // parses "[...]" / "[^...]" or falls through to a simpleexp
+ if (match('[')) {
+ boolean negate = false;
+ if (match('^')) negate = true;
+ RegExp e = parseCharClasses();
+ if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e)); // negation is built as (any single char) & ~(classes)
+ if (!match(']')) throw new IllegalArgumentException(
+ "expected ']' at position " + pos);
+ return e;
+ } else return parseSimpleExp();
+ }
+
+ final RegExp parseCharClasses() throws IllegalArgumentException {
+ RegExp e = parseCharClass();
+ while (more() && !peek("]"))
+ e = makeUnion(e, parseCharClass());
+ return e;
+ }
+
+ final RegExp parseCharClass() throws IllegalArgumentException {
+ char c = parseCharExp();
+ if (match('-')) return makeCharRange(c, parseCharExp());
+ else return makeChar(c);
+ }
+
+ final RegExp parseSimpleExp() throws IllegalArgumentException { // parses one atom: . # @ "string" ( ) (exp) <name> <n-m> or a single char
+ if (match('.')) return makeAnyChar();
+ else if (check(EMPTY) && match('#')) return makeEmpty(); // '#' is only special when the EMPTY flag is enabled
+ else if (check(ANYSTRING) && match('@')) return makeAnyString(); // likewise '@' needs the ANYSTRING flag
+ else if (match('"')) {
+ int start = pos;
+ while (more() && !peek("\"")) // advance to the closing quote
+ next();
+ if (!match('"')) throw new IllegalArgumentException(
+ "expected '\"' at position " + pos);
+ return makeString(b.substring(start, pos - 1)); // pos - 1 excludes the closing quote just consumed
+ } else if (match('(')) {
+ if (match(')')) return makeString(""); // "()" denotes the empty string
+ RegExp e = parseUnionExp();
+ if (!match(')')) throw new IllegalArgumentException(
+ "expected ')' at position " + pos);
+ return e;
+ } else if ((check(AUTOMATON) || check(INTERVAL)) && match('<')) {
+ int start = pos;
+ while (more() && !peek(">")) // advance to the closing '>'
+ next();
+ if (!match('>')) throw new IllegalArgumentException(
+ "expected '>' at position " + pos);
+ String s = b.substring(start, pos - 1);
+ int i = s.indexOf('-'); // a dash distinguishes an interval <n-m> from an identifier
+ if (i == -1) {
+ if (!check(AUTOMATON)) throw new IllegalArgumentException(
+ "interval syntax error at position " + (pos - 1));
+ return makeAutomaton(s);
+ } else {
+ if (!check(INTERVAL)) throw new IllegalArgumentException(
+ "illegal identifier at position " + (pos - 1));
+ try {
+ if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-')) throw new NumberFormatException(); // need exactly one dash with text on both sides
+ String smin = s.substring(0, i);
+ String smax = s.substring(i + 1, s.length());
+ int imin = Integer.parseInt(smin);
+ int imax = Integer.parseInt(smax);
+ int digits;
+ if (smin.length() == smax.length()) digits = smin.length(); // equal widths: remember the width
+ else digits = 0;
+ if (imin > imax) { // bounds given in reverse order: swap them
+ int t = imin;
+ imin = imax;
+ imax = t;
+ }
+ return makeInterval(imin, imax, digits);
+ } catch (NumberFormatException e) { // reported as an IllegalArgumentException with a position
+ throw new IllegalArgumentException(
+ "interval syntax error at position " + (pos - 1));
+ }
+ }
+ } else return makeChar(parseCharExp()); // anything else is a (possibly escaped) single char
+ }
+
+ final char parseCharExp() throws IllegalArgumentException { // reads one char, optionally preceded by a backslash
+ match('\\'); // skip a backslash if present; the result is deliberately ignored
+ return next(); // the following char is taken literally
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\RegExp.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/RunAutomaton.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 0)
@@ -0,0 +1,215 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Set;
+
+/**
+ * Finite-state automaton with fast run operation.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public final class RunAutomaton implements Serializable {
+
+ static final long serialVersionUID = 20001;
+
+ final int size;
+ final boolean[] accept;
+ final int initial;
+ final int[] transitions; // delta(state,c) = transitions[state*points.length +
+ // getCharClass(c)]
+ final char[] points; // char interval start points
+ final int[] classmap; // map from char number to character class index
+
+ /**
+ * Returns a string listing the initial state, then each state with its accept flag and transitions.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ b.append("initial state: ").append(initial).append("\n");
+ for (int i = 0; i < size; i++) {
+ b.append("state ").append(i); // chained appends; no temporary concatenated string
+ if (accept[i]) b.append(" [accept]:\n");
+ else b.append(" [reject]:\n");
+ for (int j = 0; j < points.length; j++) {
+ int k = transitions[i * points.length + j];
+ if (k != -1) { // -1 marks "no transition" for this character class
+ char min = points[j];
+ char max;
+ if (j + 1 < points.length) max = (char) (points[j + 1] - 1); // class ends just before the next start point
+ else max = Character.MAX_VALUE; // last class runs to the end of the char range
+ b.append(" ");
+ Transition.appendCharString(min, b);
+ if (min != max) {
+ b.append("-");
+ Transition.appendCharString(max, b);
+ }
+ b.append(" -> ").append(k).append("\n");
+ }
+ }
+ }
+ return b.toString();
+ }
+
+ /**
+ * Returns number of states in automaton.
+ */
+ public int getSize() {
+ return size;
+ }
+
+ /**
+ * Returns acceptance status for given state.
+ */
+ public boolean isAccept(int state) {
+ return accept[state];
+ }
+
+ /**
+ * Returns initial state.
+ */
+ public int getInitialState() {
+ return initial;
+ }
+
+ /**
+ * Returns a copy of the array of character class interval start points, so
+ * changes made to the returned array do not affect this automaton.
+ */
+ public char[] getCharIntervals() {
+ return points.clone();
+ }
+
+ /**
+ * Gets character class of given char.
+ */
+ int getCharClass(char c) {
+ return SpecialOperations.findIndex(c, points);
+ }
+
+ /**
+ * Constructs a new RunAutomaton from an Automaton, calling
+ * a.determinize() on it first and then tabulating its transitions.
+ *
+ * @param a an automaton; determinize() and state numbering are applied to it
+ */
+ public RunAutomaton(Automaton a) {
+ a.determinize();
+ points = a.getStartPoints();
+ Set states = a.getStates();
+ Automaton.setStateNumbers(states);
+ initial = a.initial.number;
+ size = states.size();
+ accept = new boolean[size];
+ transitions = new int[size * points.length]; // one row per state, one column per character class
+ for (int n = 0; n < size * points.length; n++)
+ transitions[n] = -1; // -1 means "no transition"
+ for (State s : states) {
+ int n = s.number;
+ accept[n] = s.accept;
+ for (int c = 0; c < points.length; c++) {
+ State q = s.step(points[c]); // probe each class using its start point
+ if (q != null) transitions[n * points.length + c] = q.number;
+ }
+ }
+ /*
+ * Set alphabet table for optimal run performance.
+ */
+ classmap = new int[Character.MAX_VALUE + 1]; // one entry per possible char value
+ int i = 0;
+ for (int j = 0; j <= Character.MAX_VALUE; j++) {
+ if (i + 1 < points.length && j == points[i + 1]) i++; // j reached the next start point: move to the next class
+ classmap[j] = i;
+ }
+ }
+
+ /**
+ * Returns the state obtained by reading the given char from the given state.
+ * Returns -1 if not obtaining any such state. (If the original
+ * Automaton had no dead states, -1 is returned here if and only
+ * if a dead state is entered in an equivalent automaton with a total
+ * transition function.)
+ */
+ public int step(int state, char c) {
+ return transitions[state * points.length + classmap[c]];
+ }
+
+ /**
+ * Returns true if the given string is accepted by this automaton.
+ */
+ public boolean run(String s) {
+ int p = initial;
+ int l = s.length();
+ for (int i = 0; i < l; i++) {
+ p = step(p, s.charAt(i));
+ if (p == -1) return false;
+ }
+ return accept[p];
+ }
+
+ /**
+ * Returns true if the chars s[offset] .. s[offset + length - 1] are accepted by this automaton.
+ */
+ public boolean run(char[] s, int offset, int length) {
+ int p = initial;
+ int l = offset + length; // exclusive end index of the slice
+ for (int i = offset; i < l; i++) {
+ p = step(p, s[i]);
+ if (p == -1) return false; // no transition for this char: reject
+ }
+ return accept[p];
+ }
+
+ /**
+ * Returns the length of the longest accepted run of the given string starting
+ * at the given offset.
+ *
+ * @param s the string
+ * @param offset offset into s where the run starts
+ * @return length of the longest accepted run, -1 if no run is accepted
+ */
+ public int run(String s, int offset) {
+ int p = initial;
+ int l = s.length();
+ int max = -1;
+ for (int r = 0; offset <= l; offset++, r++) { // r is the number of chars consumed so far
+ if (accept[p]) max = r; // current state accepts: record this prefix length
+ if (offset == l) break; // end of input reached
+ p = step(p, s.charAt(offset));
+ if (p == -1) break; // no transition: longer prefixes cannot match
+ }
+ return max;
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\RunAutomaton.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/SpecialOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0)
@@ -0,0 +1,182 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Special automata operations.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class SpecialOperations {
+
+ private SpecialOperations() {}
+
+ /**
+  * Binary search over points (which must be sorted ascending): returns the index
+  * of the largest entry that is less than or equal to c, or 0 if there is none.
+  */
+ static int findIndex(char c, char[] points) {
+   int lo = 0;
+   int hi = points.length;
+   while (hi - lo > 1) {
+     final int mid = (lo + hi) >>> 1;
+     final char probe = points[mid];
+     if (probe == c) return mid;
+     if (probe > c) hi = mid; else lo = mid;
+   }
+   return lo;
+ }
+
+ /**
+ * Returns true if the language of the given automaton is finite.
+ */
+ public static boolean isFinite(Automaton a) {
+ if (a.isSingleton()) return true; // a singleton automaton holds one string, so its language is finite
+ return isFinite(a.initial, new HashSet()); // otherwise look for a cycle, starting with an empty path
+ }
+
+ /**
+ * Depth-first search from s: returns false as soon as a transition leads back to a state on
+ * the current path (a loop), else true. (Sufficient: there are never transitions to dead states.)
+ */
+ private static boolean isFinite(State s, HashSet path) {
+ path.add(s); // s stays on the path while its successors are explored
+ for (Transition t : s.transitions)
+ if (path.contains(t.to) || !isFinite(t.to, path)) return false; // loop found here or deeper
+ path.remove(s); // backtrack: s is no longer on the current path
+ return true;
+ }
+
+ /**
+ * Returns the longest string that is a prefix of all accepted strings and
+ * visits each state at most once.
+ * @param a automaton to inspect; for a singleton its string is returned as is
+ * @return common prefix (possibly empty)
+ */
+ public static String getCommonPrefix(Automaton a) {
+ if (a.isSingleton()) return a.singleton;
+ StringBuilder b = new StringBuilder();
+ HashSet visited = new HashSet();
+ State s = a.initial;
+ boolean done;
+ do {
+ done = true; // stays true unless the prefix is extended below
+ visited.add(s);
+ if (!s.accept && s.transitions.size() == 1) { // an accept state or a branching state ends the prefix
+ Transition t = s.transitions.iterator().next();
+ if (t.min == t.max && !visited.contains(t.to)) { // only a single-char edge to an unvisited state extends it
+ b.append(t.min);
+ s = t.to;
+ done = false;
+ }
+ }
+ } while (!done);
+ return b.toString();
+ }
+
+ /**
+  * Computes the longest string that every accepted string ends with, found
+  * while visiting each state (of the reversed automaton) at most once.
+  *
+  * @return common suffix
+  */
+ public static String getCommonSuffix(Automaton a) {
+   if (a.isSingleton()) return a.singleton; // the single accepted string is its own suffix
+
+   // work on a reversed, determinized copy; its common prefix is the suffix backwards
+   final Automaton reversed = a.clone();
+   reverse(reversed);
+   reversed.determinize();
+   final String backwards = getCommonPrefix(reversed);
+   return reverseUnicode3(backwards);
+ }
+
+ /**
+ * Reverses the language of the given (non-singleton) automaton in place and returns
+ * its former accept states, which serve as the new initial states.
+ */
+ private static Set reverse(Automaton a) {
+ a.expandSingleton();
+ // reverse all edges
+ HashMap> m = new HashMap>(); // NOTE(review): generic type arguments look truncated by extraction - confirm against the original patch
+ Set states = a.getStates();
+ Set accept = a.getAcceptStates(); // taken before the accept flags are cleared below
+ for (State r : states) {
+ m.put(r, new HashSet()); // m collects, per state, the reversed (incoming) transitions
+ r.accept = false;
+ }
+ for (State r : states)
+ for (Transition t : r.getTransitions())
+ m.get(t.to).add(new Transition(t.min, t.max, r)); // edge r -> t.to becomes t.to -> r
+ for (State r : states)
+ r.transitions = m.get(r);
+ // make new initial+final states
+ a.initial.accept = true; // the old initial state is now an accept state
+ a.initial = new State();
+ for (State r : accept)
+ a.initial.addEpsilon(r); // ensures that all initial states are reachable
+ a.deterministic = false;
+ return accept;
+ }
+
+ /**
+  * Returns input reversed one UTF-16 code unit at a time (a "Unicode 3" reverse);
+  * this is deliberate, because the result of an earlier reversal is flipped back.
+  */
+ private static String reverseUnicode3(final String input) {
+   final char[] units = input.toCharArray();
+   reverseUnicode3(units, 0, units.length);
+   return String.valueOf(units);
+ }
+
+ /**
+  * Reverses buffer[start] .. buffer[start+len-1] in place, one UTF-16 code unit
+  * at a time (a "Unicode 3" reverse that does not keep surrogate pairs together).
+  * This is intentional: it is only used by getCommonSuffix(), which reverses
+  * the entire FSM code unit by code unit, so the common prefix found there
+  * has to be flipped back with the very same code unit reversal.
+  */
+ private static void reverseUnicode3(char[] buffer, int start, int len){
+   if (len <= 1) return;
+   // swap inwards from both ends; the middle unit of an odd-length run stays put
+   for (int lo = start, hi = start + len - 1; lo < hi; lo++, hi--) {
+     final char tmp = buffer[lo];
+     buffer[lo] = buffer[hi];
+     buffer[hi] = tmp;
+   }
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\SpecialOperations.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/State.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/State.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/State.java (revision 0)
@@ -0,0 +1,214 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Automaton state.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class State implements Serializable, Comparable {
+
+ static final long serialVersionUID = 30001;
+
+ boolean accept; // true if this is an accept state
+ Set transitions; // outgoing transitions of this state
+
+ int number; // never assigned in this class; see getNumber()
+
+ int id; // creation sequence number, taken from next_id in the constructor
+ static int next_id; // NOTE(review): incremented without synchronization - confirm states are only built by one thread
+
+ /**
+ * Constructs a new reject state with an empty transition set and the next free id.
+ */
+ public State() {
+ resetTransitions();
+ id = next_id++;
+ }
+
+ /**
+ * Replaces the transition set with a new, empty set.
+ */
+ final void resetTransitions() {
+ transitions = new HashSet();
+ }
+
+ /**
+ * Returns the live set of outgoing transitions (not a copy). Subsequent
+ * changes to it are reflected in the automaton.
+ *
+ * @return transition set
+ */
+ public Set getTransitions() {
+ return transitions;
+ }
+
+ /**
+ * Adds an outgoing transition.
+ *
+ * @param t transition
+ */
+ public void addTransition(Transition t) {
+ transitions.add(t);
+ }
+
+ /**
+ * Sets acceptance for this state.
+ *
+ * @param accept if true, this state is an accept state
+ */
+ public void setAccept(boolean accept) {
+ this.accept = accept;
+ }
+
+ /**
+ * Returns acceptance status.
+ *
+ * @return true if this is an accept state
+ */
+ public boolean isAccept() {
+ return accept;
+ }
+
+ /**
+ * Performs lookup in transitions, assuming determinism: the first matching transition found wins.
+ *
+ * @param c character to look up
+ * @return destination state, null if no matching outgoing transition
+ * @see #step(char, Collection)
+ */
+ public State step(char c) {
+ for (Transition t : transitions)
+ if (t.min <= c && c <= t.max) return t.to;
+ return null;
+ }
+
+ /**
+ * Performs lookup in transitions, allowing nondeterminism: every match is collected.
+ *
+ * @param c character to look up
+ * @param dest collection to which the destination of each matching transition is added
+ * @see #step(char)
+ */
+ public void step(char c, Collection dest) {
+ for (Transition t : transitions)
+ if (t.min <= c && c <= t.max) dest.add(t.to);
+ }
+
+ void addEpsilon(State to) { // folds 'to' into this state, as if linked by an epsilon transition
+ if (to.accept) accept = true; // acceptance is only ever switched on here, never off
+ for (Transition t : to.transitions)
+ transitions.add(t); // the Transition objects themselves are shared, not cloned
+ }
+
+ /**
+ * Returns a new array of the transitions, sorted by (to, min, reverse max) if
+ * to_first is true and by (min, reverse max, to) otherwise.
+ */
+ public Transition[] getSortedTransitionArray(boolean to_first) {
+ Transition[] e = transitions.toArray(new Transition[transitions.size()]);
+ Arrays.sort(e, new TransitionComparator(to_first));
+ return e;
+ }
+
+ /**
+ * Returns sorted list of outgoing transitions (a list view of a freshly sorted array).
+ *
+ * @param to_first if true, order by (to, min, reverse max); otherwise (min,
+ * reverse max, to)
+ * @return transition list
+ */
+ public List getSortedTransitions(boolean to_first) {
+ return Arrays.asList(getSortedTransitionArray(to_first));
+ }
+
+
+ /**
+ * Return this state's number.
+ *
+ * Expert: Will be useless unless {@link Automaton#setStateNumbers(Set)}
+ * has been called first to number the states.
+ * @return the number
+ */
+ public int getNumber() {
+ return number;
+ }
+
+ /**
+ * Returns string describing this state: its number, accept/reject status and one line
+ * per transition. Normally invoked via {@link Automaton#toString()}.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ b.append("state ").append(number);
+ if (accept) b.append(" [accept]");
+ else b.append(" [reject]");
+ b.append(":\n");
+ for (Transition t : transitions)
+ b.append(" ").append(t.toString()).append("\n");
+ return b.toString();
+ }
+
+ /**
+ * Compares this state with s by construction order. The result is s.id - id,
+ * so a state constructed later (larger id) sorts before an earlier one.
+ */
+ public int compareTo(State s) {
+ return s.id - id;
+ }
+
+ /**
+ * Delegates to the superclass; see {@link java.lang.Object#equals(java.lang.Object)}.
+ */
+ @Override
+ public boolean equals(Object obj) {
+ return super.equals(obj);
+ }
+
+ /**
+ * Delegates to the superclass; see {@link java.lang.Object#hashCode()}.
+ */
+ @Override
+ public int hashCode() {
+ return super.hashCode();
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\State.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/StatePair.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/StatePair.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/StatePair.java (revision 0)
@@ -0,0 +1,104 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+/**
+ * Pair of states.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class StatePair {
+ State s; // NOTE(review): only assigned by the package-private constructor; its role is not visible here
+ State s1; // first component
+ State s2; // second component
+
+ StatePair(State s, State s1, State s2) {
+   this.s1 = s1;
+   this.s2 = s2;
+   this.s = s;
+ }
+
+ /**
+  * Creates a pair from two states; the extra field s is left null.
+  *
+  * @param s1 first state
+  * @param s2 second state
+  */
+ public StatePair(State s1, State s2) {
+   // same field values as assigning s1 and s2 directly: s simply stays null
+   this(null, s1, s2);
+ }
+
+ /**
+  * Gives access to the first component of the pair.
+  *
+  * @return first state
+  */
+ public State getFirstState() {
+   return this.s1;
+ }
+
+ /**
+  * Gives access to the second component of the pair.
+  *
+  * @return second state
+  */
+ public State getSecondState() {
+   return this.s2;
+ }
+
+ /**
+  * Two pairs are equal when both components are the very same State objects
+  * (reference comparison); the field s takes no part in the comparison.
+  *
+  * @param obj object to compare with
+  * @return true if obj is a StatePair holding the same two states, in the same order
+  */
+ @Override
+ public boolean equals(Object obj) {
+   if (!(obj instanceof StatePair)) return false;
+   final StatePair other = (StatePair) obj;
+   // identity, not equals(): the components must be the same State instances
+   return other.s1 == s1 && other.s2 == s2;
+ }
+
+ /**
+  * Hash code derived from both components.
+  *
+  * @return sum of the two states' hash codes
+  */
+ @Override
+ public int hashCode() {
+   return s1.hashCode() + s2.hashCode();
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\StatePair.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/Transition.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/Transition.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/Transition.java (revision 0)
@@ -0,0 +1,179 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+
+/**
+ * Automaton transition.
+ *
+ * A transition, which belongs to a source state, consists of a Unicode
+ * character interval and a destination state.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class Transition implements Serializable, Cloneable {
+
+ static final long serialVersionUID = 40001;
+
+ /*
+  * Class invariant: min <= max (both constructors establish it).
+  */
+
+ char min;
+ char max;
+
+ State to;
+
+ /**
+  * Creates a transition that matches exactly one character.
+  *
+  * @param c transition character (used as both interval end points)
+  * @param to destination state
+  */
+ public Transition(char c, State to) {
+   // min == max, so the general constructor never needs to swap
+   this(c, c, to);
+ }
+
+ /**
+  * Creates a transition over [min, max], both inclusive; reversed end points are swapped.
+  *
+  * @param min transition interval minimum
+  * @param max transition interval maximum
+  * @param to destination state
+  */
+ public Transition(char min, char max, State to) {
+   if (min <= max) {
+     this.min = min;
+     this.max = max;
+   } else {
+     this.min = max;
+     this.max = min;
+   }
+   this.to = to;
+ }
+
+ /** Lower end point (inclusive) of the character interval. */
+ public char getMin() {
+   return this.min;
+ }
+
+ /** Upper end point (inclusive) of the character interval. */
+ public char getMax() {
+   return this.max;
+ }
+
+ /** State this transition leads to. */
+ public State getDest() {
+   return this.to;
+ }
+
+ /**
+  * Compares interval and destination; the destination is compared by reference.
+  *
+  * @param obj object to compare with
+  * @return true if obj is a transition with same character interval
+  *         and destination state as this transition.
+  */
+ @Override
+ public boolean equals(Object obj) {
+   if (!(obj instanceof Transition)) return false;
+   final Transition other = (Transition) obj;
+   // "to" is matched with ==, i.e. it must be the same State object
+   return other.min == min && other.max == max && other.to == to;
+ }
+
+ /**
+  * Hash code computed from the two interval end points only; the destination
+  * state does not contribute.
+  *
+  * @return hash code
+  */
+ @Override
+ public int hashCode() {
+   return 2 * min + 3 * max;
+ }
+
+ /**
+  * Makes a shallow copy: the clone points at the same destination State.
+  *
+  * @return clone with same character interval and destination state
+  */
+ @Override
+ public Transition clone() {
+   try {
+     return (Transition) super.clone();
+   } catch (CloneNotSupportedException e) {
+     throw new RuntimeException(e); // not expected: this class implements Cloneable
+   }
+ }
+
+ static void appendCharString(char c, StringBuilder b) {
+   if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') {
+     b.append(c);
+     return;
+   }
+   // anything else becomes a backslash-u escape, zero-padded to four hex digits
+   final String hex = Integer.toHexString(c);
+   b.append("\\u");
+   for (int pad = hex.length(); pad < 4; pad++) b.append('0');
+   b.append(hex);
+ }
+
+ /**
+  * Returns a string describing this transition: the interval, then " -> " and
+  * the destination's number. Normally invoked via {@link Automaton#toString()}.
+  */
+ @Override
+ public String toString() {
+   final StringBuilder out = new StringBuilder();
+   appendCharString(min, out);
+   // a one-char interval prints just that char, otherwise "min-max"
+   if (min != max) {
+     out.append("-");
+     appendCharString(max, out);
+   }
+   return out.append(" -> ").append(to.number).toString();
+ }
+
+ void appendDot(StringBuilder b) {
+   // emits: -> <dest number> [label="<interval>"] followed by a newline
+   b.append(" -> ");
+   b.append(to.number);
+   b.append(" [label=\"");
+   appendCharString(min, b);
+   if (min != max) appendCharString(max, b.append("-"));
+   b.append("\"]\n");
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\Transition.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/TransitionComparator.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0)
@@ -0,0 +1,80 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+/**
+ * Comparator for state {@link Transition}s that orders unicode char range
+ * transitions in lexicographic order.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+class TransitionComparator implements Comparator, Serializable {
+
+ static final long serialVersionUID = 10001;
+
+ boolean to_first;
+
+ TransitionComparator(boolean to_first) {
+   this.to_first = to_first;
+ }
+
+ /**
+  * Orders by (to, min, reverse max) when to_first is set, else by (min, reverse max, to).
+  */
+ public int compare(Transition t1, Transition t2) {
+   if (to_first) {
+     final int byDest = compareDest(t1, t2);
+     if (byDest != 0) return byDest;
+   }
+   if (t1.min != t2.min) return t1.min < t2.min ? -1 : 1;
+   if (t1.max != t2.max) return t1.max > t2.max ? -1 : 1; // reverse max: the larger max sorts first
+   return to_first ? 0 : compareDest(t1, t2);
+ }
+
+ /**
+  * Compares the destination states: an identical reference ties, a null
+  * destination sorts first, otherwise the smaller State.number sorts first.
+  * Distinct states that carry the same number also tie (result 0).
+  */
+ private static int compareDest(Transition t1, Transition t2) {
+   if (t1.to == t2.to) return 0;
+   if (t1.to == null) return -1;
+   if (t2.to == null) return 1;
+   if (t1.to.number < t2.to.number) return -1;
+   if (t1.to.number > t2.to.number) return 1;
+   return 0;
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\TransitionComparator.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/UnicodeUtil.java
===================================================================
--- src/java/org/apache/lucene/util/UnicodeUtil.java (revision 888316)
+++ src/java/org/apache/lucene/util/UnicodeUtil.java (working copy)
@@ -374,36 +374,53 @@
* @return next valid UTF-16 String in UTF-16 order
*/
public static String nextValidUTF16String(String s) {
- final int size = s.length();
+ if (validUTF16String(s))
+ return s;
+ else {
+ UTF16Result chars = new UTF16Result();
+ chars.copyText(s);
+ nextValidUTF16String(chars);
+ return new String(chars.result, 0, chars.length);
+ }
+ }
+
+ public static void nextValidUTF16String(UTF16Result s) {
+ final int size = s.length;
for (int i = 0; i < size; i++) {
- char ch = s.charAt(i);
+ char ch = s.result[i];
if (ch >= UnicodeUtil.UNI_SUR_HIGH_START
&& ch <= UnicodeUtil.UNI_SUR_HIGH_END) {
if (i < size - 1) {
i++;
- char nextCH = s.charAt(i);
+ char nextCH = s.result[i];
if (nextCH >= UnicodeUtil.UNI_SUR_LOW_START
&& nextCH <= UnicodeUtil.UNI_SUR_LOW_END) {
// Valid surrogate pair
} else
// Unmatched high surrogate
- if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) // SMP not enumerated
- return s.substring(0, i) +
- (char) UnicodeUtil.UNI_SUR_LOW_START;
- else // SMP already enumerated
- return s.substring(0, i - 1) +
- (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
- } else
+ if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) { // SMP not enumerated
+ s.setLength(i + 1);
+ s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START;
+ return;
+ } else { // SMP already enumerated
+ s.setLength(i);
+ s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
+ return;
+ }
+ } else {
// Unmatched high surrogate in final position, SMP not yet enumerated
- return s + (char) UnicodeUtil.UNI_SUR_LOW_START;
+ s.setLength(i + 2);
+ s.result[i + 1] = (char) UnicodeUtil.UNI_SUR_LOW_START;
+ return;
+ }
} else if (ch >= UnicodeUtil.UNI_SUR_LOW_START
- && ch <= UnicodeUtil.UNI_SUR_LOW_END)
+ && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
// Unmatched low surrogate, SMP already enumerated
- return s.substring(0, i) +
- (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
+ s.setLength(i + 1);
+ s.result[i] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
+ return;
+ }
}
-
- return s;
}
// Only called from assert
@@ -460,7 +477,7 @@
return false;
}
}
-
+ */
public static final boolean validUTF16String(String s) {
final int size = s.length();
for(int i=0;i"));
+ assertEquals(0, regexQueryNrHits("<493433-600000>"));
+ }
+
+ public void testRegexComplement() throws IOException {
+ assertEquals(1, regexQueryNrHits("4934~[3]"));
+ // complement of the empty language ("#"), i.e. match all docs
+ assertEquals(1, regexQueryNrHits("~#"));
+ }
+
+ public void testCustomProvider() throws IOException {
+ AutomatonProvider myProvider = new AutomatonProvider() {
+ // automaton that matches "quick", "brown" or "bob"
+ private Automaton quickBrownAutomaton = BasicOperations.union(
+ Arrays.asList(new Automaton[] {
+ BasicAutomata.makeString("quick"),
+ BasicAutomata.makeString("brown"),
+ BasicAutomata.makeString("bob")}));
+
+ public Automaton getAutomaton(String name) throws IOException {
+ if (name.equals("quickBrown"))
+ return quickBrownAutomaton;
+ else
+ return null; // every other name is unknown to this provider
+ }
+ };
+ RegexpQuery query = new RegexpQuery(newTerm(""), RegExp.ALL, myProvider); // NOTE(review): empty pattern - a named-automaton reference such as <quickBrown> may have been lost in extraction; confirm against the original test
+ assertEquals(1, searcher.search(query, 5).totalHits);
+ }
+
+ /**
+ * Test a corner case for backtracking:
+ * In this case the term dictionary has 493432 followed by 49344.
+ * When backtracking from 49343... to 4934, it is necessary
+ * to test that 4934 itself is ok before trying to append more characters.
+ */
+ public void testBacktracking() throws IOException {
+ assertEquals(1, regexQueryNrHits("4934[314]"));
+ }
+}
+
Property changes on: src\test\org\apache\lucene\search\TestRegexpQuery.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/test/org/apache/lucene/search/TestRegexpRandom.java
===================================================================
--- src/test/org/apache/lucene/search/TestRegexpRandom.java (revision 0)
+++ src/test/org/apache/lucene/search/TestRegexpRandom.java (revision 0)
@@ -0,0 +1,144 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Random;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Create an index with terms from 0000-9999.
+ * Generates random regexps according to simple patterns,
+ * and validates the correct number of hits are returned.
+ */
+public class TestRegexpRandom extends LuceneTestCase {
+ private Searcher searcher; // opened in setUp(), closed in tearDown()
+ private Random random; // created at the start of testRegexps()
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ RAMDirectory dir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+
+ Document doc = new Document();
+ Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); // one Field instance, its value is replaced for every document
+ doc.add(field);
+
+ NumberFormat df = new DecimalFormat("0000"); // zero-padded, so every term has four digits
+ for (int i = 0; i < 10000; i++) {
+ field.setValue(df.format(i));
+ writer.addDocument(doc);
+ }
+
+ writer.optimize();
+ writer.close();
+ searcher = new IndexSearcher(dir);
+ }
+
+ private char N() {
+ return (char) (0x30 + random.nextInt(10)); // random ASCII digit '0'..'9'
+ }
+
+ private String fillPattern(String wildcardPattern) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < wildcardPattern.length(); i++) {
+ switch(wildcardPattern.charAt(i)) {
+ case 'N': // placeholder: replaced by a random digit
+ sb.append(N());
+ break;
+ default: // every other char is copied through unchanged
+ sb.append(wildcardPattern.charAt(i));
+ }
+ }
+ return sb.toString();
+ }
+
+ private void assertPatternHits(String pattern, int numHits) throws Exception {
+ Query wq = new RegexpQuery(new Term("field", fillPattern(pattern)));
+ TopDocs docs = searcher.search(wq, 25); // only totalHits is checked below
+ assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits);
+ }
+
+ @Override
+ protected void tearDown() throws Exception {
+ searcher.close();
+ super.tearDown();
+ }
+
+ public void testRegexps() throws Exception {
+ random = newRandom(System.nanoTime());
+ for (int i = 0; i < 100; i++) {
+ assertPatternHits("NNNN", 1);
+ assertPatternHits(".NNN", 10);
+ assertPatternHits("N.NN", 10);
+ assertPatternHits("NN.N", 10);
+ assertPatternHits("NNN.", 10);
+ }
+
+ for (int i = 0; i < 10; i++) {
+ assertPatternHits(".{1,2}NN", 100);
+ assertPatternHits("N.{1,2}N", 100);
+ assertPatternHits("NN.{1,2}", 100);
+ assertPatternHits(".{1,3}N", 1000);
+ assertPatternHits("N.{1,3}", 1000);
+ assertPatternHits(".{1,4}", 10000);
+
+ assertPatternHits("NNN[3-7]", 5);
+ assertPatternHits("NN[2-6][3-7]", 25);
+ assertPatternHits("N[1-5][2-6][3-7]", 125);
+ assertPatternHits("[0-4][3-7][4-8][5-9]", 625);
+ assertPatternHits("[3-7][2-6][0-4]N", 125);
+ assertPatternHits("[2-6][3-7]NN", 25);
+ assertPatternHits("[3-7]NNN", 5);
+
+ assertPatternHits("NNN.*", 10);
+ assertPatternHits("NN.*", 100);
+ assertPatternHits("N.*", 1000);
+ assertPatternHits(".*", 10000);
+
+ assertPatternHits(".*NNN", 10);
+ assertPatternHits(".*NN", 100);
+ assertPatternHits(".*N", 1000);
+
+ assertPatternHits("N.*NN", 10);
+ assertPatternHits("NN.*N", 10);
+
+ // combinations of '.' (any single char) and '.*' (any string)
+ assertPatternHits(".NN.*", 100);
+ assertPatternHits("N.N.*", 100);
+ assertPatternHits("NN..*", 100);
+ assertPatternHits(".N..*", 1000);
+ assertPatternHits("N...*", 1000);
+
+ assertPatternHits(".*NN.", 100);
+ assertPatternHits(".*N..", 1000);
+ assertPatternHits(".*...", 10000);
+ assertPatternHits(".*.N.", 1000);
+ assertPatternHits(".*..N", 1000);
+ }
+ }
+}
Property changes on: src\test\org\apache\lucene\search\TestRegexpRandom.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/test/org/apache/lucene/search/TestWildcard.java
===================================================================
--- src/test/org/apache/lucene/search/TestWildcard.java (revision 888316)
+++ src/test/org/apache/lucene/search/TestWildcard.java (working copy)
@@ -24,6 +24,7 @@
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
@@ -120,14 +121,12 @@
MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
assertMatches(searcher, wq, 2);
- assertTrue(wq.getEnum(searcher.getIndexReader()) instanceof PrefixTermEnum);
assertTrue(wq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum);
wq = new WildcardQuery(new Term("field", "*"));
assertMatches(searcher, wq, 2);
- assertTrue(wq.getEnum(searcher.getIndexReader()) instanceof PrefixTermEnum);
assertFalse(wq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum);
- assertFalse(wq.getTermsEnum(searcher.getIndexReader()) instanceof WildcardTermsEnum);
+ assertFalse(wq.getTermsEnum(searcher.getIndexReader()) instanceof AutomatonTermsEnum);
}
/**
@@ -309,5 +308,71 @@
searcher.close();
}
+  /**
+   * Minimal MultiTermQuery whose getEnum returns the deprecated
+   * WildcardTermEnum; used only by testDeprecatedTermEnum.
+   */
+  @Deprecated
+  private static final class OldWildcardQuery extends MultiTermQuery {
+    final Term term;
+
+    OldWildcardQuery(Term term) {
+      this.term = term;
+    }
+
+    @Override
+    protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
+      return new WildcardTermEnum(reader, term);
+    }
+
+    @Override
+    public String toString(String field) {
+      return "OldWildcard(" + term.toString() + ")";
+    }
+  }
+
+  /**
+   * Runs wildcard patterns through OldWildcardQuery against the store that
+   * getIndexStore returns for the values "metal" and "metals", and checks
+   * the number of matches for each query.
+   */
+  @Deprecated
+  public void testDeprecatedTermEnum() throws Exception {
+    RAMDirectory indexStore = getIndexStore("body", new String[]
+        {"metal", "metals"});
+    IndexSearcher searcher = new IndexSearcher(indexStore, true);
+    try {
+      Query query1 = new TermQuery(new Term("body", "metal"));
+      Query query2 = new OldWildcardQuery(new Term("body", "metal*"));
+      Query query3 = new OldWildcardQuery(new Term("body", "m*tal"));
+      Query query4 = new OldWildcardQuery(new Term("body", "m*tal*"));
+      Query query5 = new OldWildcardQuery(new Term("body", "m*tals"));
+
+      BooleanQuery query6 = new BooleanQuery();
+      query6.add(query5, BooleanClause.Occur.SHOULD);
+
+      BooleanQuery query7 = new BooleanQuery();
+      query7.add(query3, BooleanClause.Occur.SHOULD);
+      query7.add(query5, BooleanClause.Occur.SHOULD);
+
+      // Queries do not automatically lower-case search terms:
+      Query query8 = new OldWildcardQuery(new Term("body", "M*tal*"));
+
+      assertMatches(searcher, query1, 1);
+      assertMatches(searcher, query2, 2);
+      assertMatches(searcher, query3, 1);
+      assertMatches(searcher, query4, 2);
+      assertMatches(searcher, query5, 1);
+      assertMatches(searcher, query6, 1);
+      assertMatches(searcher, query7, 2);
+      assertMatches(searcher, query8, 0);
+      assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tall")), 0);
+      assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal")), 1);
+      assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal*")), 2);
+    } finally {
+      // always close the searcher, even when an assertion above fails
+      searcher.close();
+    }
+  }
}
Index: src/test/org/apache/lucene/search/TestWildcardRandom.java
===================================================================
--- src/test/org/apache/lucene/search/TestWildcardRandom.java (revision 0)
+++ src/test/org/apache/lucene/search/TestWildcardRandom.java (revision 0)
@@ -0,0 +1,159 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.text.NumberFormat;
+import java.util.Locale;
+import java.util.Random;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Create an index with terms from 0000-9999.
+ * Generates random wildcards according to patterns,
+ * and validates the correct number of hits are returned.
+ */
+public class TestWildcardRandom extends LuceneTestCase {
+  private Searcher searcher;
+  private Random random;
+
+  /** Indexes 10,000 single-term documents, "0000" through "9999", into a RAMDirectory. */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+
+    // one Document/Field pair is reused; only the field value changes per iteration
+    Document doc = new Document();
+    Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
+    doc.add(field);
+
+    // Pin the digit symbols: under a default locale with non-ASCII digits the indexed
+    // terms would never match the ASCII digits that N() puts into the queries.
+    NumberFormat df = new DecimalFormat("0000", new DecimalFormatSymbols(Locale.ENGLISH));
+    for (int i = 0; i < 10000; i++) {
+      field.setValue(df.format(i));
+      writer.addDocument(doc);
+    }
+
+    writer.optimize();
+    writer.close();
+    searcher = new IndexSearcher(dir);
+  }
+
+  /** Returns a random ASCII digit, '0' through '9'. */
+  private char N() {
+    return (char) (0x30 + random.nextInt(10));
+  }
+
+  /**
+   * Replaces every 'N' in the template with a random digit;
+   * all other characters are copied through unchanged.
+   */
+  private String fillPattern(String wildcardPattern) {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < wildcardPattern.length(); i++) {
+      switch(wildcardPattern.charAt(i)) {
+        case 'N':
+          sb.append(N());
+          break;
+        default:
+          sb.append(wildcardPattern.charAt(i));
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Fills the template, runs it as a WildcardQuery on "field" and
+   * asserts that the total hit count equals numHits.
+   */
+  private void assertPatternHits(String pattern, int numHits) throws Exception {
+    // keep the concrete query text so a failing run shows exactly what was searched
+    String filled = fillPattern(pattern);
+    Query wq = new WildcardQuery(new Term("field", filled));
+    TopDocs docs = searcher.search(wq, 25);
+    assertEquals("Incorrect hits for pattern: " + pattern + " (query: " + filled + ")",
+        numHits, docs.totalHits);
+  }
+
+  /** Closes the searcher opened in setUp. */
+  @Override
+  protected void tearDown() throws Exception {
+    searcher.close();
+    super.tearDown();
+  }
+
+  /**
+   * Every indexed term has exactly four digits, so each digit fixed by an N
+   * cuts the 10,000 possible matches by a factor of ten.
+   */
+  public void testWildcards() throws Exception {
+    random = newRandom(System.nanoTime());
+    for (int i = 0; i < 100; i++) {
+      assertPatternHits("NNNN", 1);
+      assertPatternHits("?NNN", 10);
+      assertPatternHits("N?NN", 10);
+      assertPatternHits("NN?N", 10);
+      assertPatternHits("NNN?", 10);
+    }
+
+    for (int i = 0; i < 10; i++) {
+      assertPatternHits("??NN", 100);
+      assertPatternHits("N??N", 100);
+      assertPatternHits("NN??", 100);
+      assertPatternHits("???N", 1000);
+      assertPatternHits("N???", 1000);
+      assertPatternHits("????", 10000);
+
+      assertPatternHits("NNN*", 10);
+      assertPatternHits("NN*", 100);
+      assertPatternHits("N*", 1000);
+      assertPatternHits("*", 10000);
+
+      assertPatternHits("*NNN", 10);
+      assertPatternHits("*NN", 100);
+      assertPatternHits("*N", 1000);
+
+      assertPatternHits("N*NN", 10);
+      assertPatternHits("NN*N", 10);
+
+      // combo of ? and * operators
+      assertPatternHits("?NN*", 100);
+      assertPatternHits("N?N*", 100);
+      assertPatternHits("NN?*", 100);
+      assertPatternHits("?N?*", 1000);
+      assertPatternHits("N??*", 1000);
+
+      assertPatternHits("*NN?", 100);
+      assertPatternHits("*N??", 1000);
+      assertPatternHits("*???", 10000);
+      assertPatternHits("*?N?", 1000);
+      assertPatternHits("*??N", 1000);
+    }
+  }
+}
Property changes on: src\test\org\apache\lucene\search\TestWildcardRandom.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native