Index: src/test/org/apache/lucene/search/TestWildcard.java =================================================================== --- src/test/org/apache/lucene/search/TestWildcard.java (revision 883088) +++ src/test/org/apache/lucene/search/TestWildcard.java (working copy) @@ -24,6 +24,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.Index; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.QueryParser; @@ -326,4 +327,57 @@ searcher.close(); } + @Deprecated + private static final class OldWildcardQuery extends MultiTermQuery { + final Term term; + + OldWildcardQuery(Term term) { + this.term = term; + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new WildcardTermEnum(reader, term); + } + + @Override + public String toString(String field) { + return "OldWildcard(" + term.toString()+ ")"; + } + } + + @Deprecated + public void testDeprecatedTermEnum() throws Exception { + RAMDirectory indexStore = getIndexStore("body", new String[] + {"metal", "metals"}); + IndexSearcher searcher = new IndexSearcher(indexStore, true); + Query query1 = new TermQuery(new Term("body", "metal")); + Query query2 = new OldWildcardQuery(new Term("body", "metal*")); + Query query3 = new OldWildcardQuery(new Term("body", "m*tal")); + Query query4 = new OldWildcardQuery(new Term("body", "m*tal*")); + Query query5 = new OldWildcardQuery(new Term("body", "m*tals")); + + BooleanQuery query6 = new BooleanQuery(); + query6.add(query5, BooleanClause.Occur.SHOULD); + + BooleanQuery query7 = new BooleanQuery(); + query7.add(query3, BooleanClause.Occur.SHOULD); + query7.add(query5, BooleanClause.Occur.SHOULD); + + // Queries do not automatically lower-case search terms: + Query query8 = new OldWildcardQuery(new Term("body", "M*tal*")); + + 
assertMatches(searcher, query1, 1); + assertMatches(searcher, query2, 2); + assertMatches(searcher, query3, 1); + assertMatches(searcher, query4, 2); + assertMatches(searcher, query5, 1); + assertMatches(searcher, query6, 1); + assertMatches(searcher, query7, 2); + assertMatches(searcher, query8, 0); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tall")), 0); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal")), 1); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal*")), 2); + } + } Index: src/java/org/apache/lucene/search/RegexpQuery.java =================================================================== --- src/java/org/apache/lucene/search/RegexpQuery.java (revision 0) +++ src/java/org/apache/lucene/search/RegexpQuery.java (revision 0) @@ -0,0 +1,89 @@ +package org.apache.lucene.search; + +import java.io.IOException; + +import org.apache.lucene.index.Term; + +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonProvider; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * A fast regular expression query based on the + * {@link org.apache.lucene.util.automaton} package. + * + *

+ * The supported syntax is documented in the {@link RegExp} class. + *

+ *

+ * Note this query can be slow, as it needs to iterate over many terms. In order + * to prevent extremely slow RegexpQueries, a Regexp term should not start with + * the expression .* + * + * @see RegExp + */ +public class RegexpQuery extends AutomatonQuery { + /** + * A provider that provides no named automata + */ + private static AutomatonProvider defaultProvider = new AutomatonProvider() { + public Automaton getAutomaton(String name) throws IOException { + return null; + } + }; + + /** + * Constructs a query for terms matching term. + *

+ * By default, all regular expression features are enabled. + *

+ * + * @param term regular expression. + */ + public RegexpQuery(Term term) { + this(term, RegExp.ALL); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param flags optional RegExp features from {@link RegExp} + */ + public RegexpQuery(Term term, int flags) { + this(term, flags, defaultProvider); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param flags optional RegExp features from {@link RegExp} + * @param provider custom AutomatonProvider for named automata + */ + public RegexpQuery(Term term, int flags, AutomatonProvider provider) { + super(term, new RegExp(term.text(), flags).toAutomaton(provider)); + } +} Property changes on: src/java/org/apache/lucene/search/RegexpQuery.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/AutomatonTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/AutomatonTermEnum.java (revision 0) +++ src/java/org/apache/lucene/search/AutomatonTermEnum.java (revision 0) @@ -0,0 +1,361 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.SpecialOperations; + +/** + * A FilteredTermEnum that enumerates terms based upon what is accepted by a + * DFA. + *

+ * The algorithm is as follows: + *

    + *
  1. As long as matches are successful, keep reading sequentially. + *
  2. When a match fails, skip to the next string in lexicographic order that + * does not enter a reject state. + *
+ *

+ * The algorithm does not attempt to actually skip to the next string that is + * completely accepted. This is not possible when the language accepted by the + * DFA is not finite (e.g. when it uses the * operator). + *

+ *

+ * If the DFA has a leading Kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case it's much + * better to do just that than to use smart enumeration. + * This heuristic looks for an initial loop, with a range of at least 1/3 + * of the Unicode BMP. + * Use {@link #usesLinearMode} to find out whether it enumerates all terms + * in linear mode without seeking. + *

+ */ +public class AutomatonTermEnum extends FilteredTermEnum { + private final IndexReader reader; + private final Term queryTerm; + private final RunAutomaton runAutomaton; + private final Automaton automaton; + private final boolean linearMode; + private final String commonPrefix; + // the last term that was compared + private Term lastTerm = null; + private boolean endEnum = false; + // for complex machines that must make a lot of comparisons + private final Map transitionCache; + + /** + * Construct an enumerator based upon an automaton, enumerating the specified + * field, working on a supplied reader. + */ + AutomatonTermEnum(Automaton automaton, Term queryTerm, IndexReader reader) + throws IOException { + super(); + this.reader = reader; + this.queryTerm = queryTerm; + this.automaton = automaton; + + /* + * If the DFA has a leading kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case its much + * better to do just that than to use smart enumeration. + * + * this heuristic looks for an initial loop, with a range of at least 1/3 + * of the unicode BMP. + */ + State initialState = automaton.getInitialState(); + boolean linearMode = false; + for (Transition transition : initialState.getTransitions()) { + if (transition.getDest() == initialState && (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) { + linearMode = true; + break; + } + } + this.linearMode = linearMode; + + /* + * tableize the automaton. this also ensures it is deterministic, and has no + * transitions to dead states. 
+ */ + runAutomaton = new RunAutomaton(this.automaton); + + if (this.linearMode) { + // iterate all terms in linear mode + commonPrefix = SpecialOperations.getCommonPrefix(automaton); + transitionCache = null; + lastTerm = queryTerm.createTerm(commonPrefix); + + } else { + commonPrefix = ""; + + // build a cache of sorted transitions for every state + transitionCache = new HashMap(runAutomaton.getSize()); + for (State state : this.automaton.getStates()) { + List transitions = state.getSortedTransitions(false); + transitionCache.put(state, transitions.toArray(new Transition[transitions.size()])); + } + + String startPoint = nextString(""); + + /* + * in this case this automaton will not accept any strings. + * start the enumeration at the empty string, next() will return false. + */ + if (startPoint == null) { + startPoint = ""; + } + + lastTerm = queryTerm.createTerm(startPoint); + } + + setEnum(reader.terms(lastTerm)); + } + + /** + * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode. + */ + public final boolean usesLinearMode() { + return linearMode; + } + + @Override + public float difference() { + return 1.0f; + } + + /** + * Returns true if the term matches the automaton. Also stashes away the term + * to assist with smart enumeration. + *

In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted. + * In smart mode, it will never do this. + */ + @Override + protected boolean termCompare(final Term term) { + lastTerm = term; + final String text = term.text(); + if (term.field() == queryTerm.field() && (!linearMode || text.startsWith(commonPrefix))) { + return runAutomaton.run(text); + } else { + // only set endEnum in linearMode + endEnum = linearMode; + return false; + } + } + + /** + * In smart mode, increments to the next term matching this automaton. + * After a successful comparison, it simply tries the next term. + * After an unsuccessful comparison, it seeks to a smarter position. + *

If the enum is in linear mode, it simply calls {@code super.next()} to + * just filter the current enum until {@link #endEnum} returns {@code true}. + */ + @Override + public boolean next() throws IOException { + if (linearMode) + return super.next(); + + do { + /* + * if the previous enumeration was a match, don't even bother + * trying to compute the next place to seek to. + * this is an optimization for a DFA that matches many sequential terms, + * such as ab* + */ + if (lastTerm == currentTerm) { + actualEnum.next(); + } else { + // seek to the next possible string + String nextPoint = nextString(lastTerm.text()); + if (nextPoint == null) { // no more possible strings can match + currentTerm = null; + endEnum = true; + return false; + } + // replace the old enumerator with a new one, positioned to a nice place + actualEnum.close(); + actualEnum = reader.terms(lastTerm.createTerm(nextPoint)); + } + + Term candidateTerm = actualEnum.term(); // read a term + + /* + * this means end of enumeration: no more terms for this field or no more + * terms at all + */ + if (candidateTerm == null || candidateTerm.field() != queryTerm.field()) { + currentTerm = null; + endEnum = true; + return false; + } + + // if the term matches the automaton, success! + if (termCompare(candidateTerm)) { + currentTerm = candidateTerm; + return true; + } + } while (true); + } + + /** + * This method should only be called in linear mode, in smart + * mode the result is undefined, as the handling of exhausted enums + * is done inside {@link #next}. + */ + @Override + protected boolean endEnum() { + assert linearMode : "endEnum() should only be called in linear mode"; + return endEnum; + } + + /** + * Returns the next String in lexicographic order after s that will not put + * the machine into a reject state. If such a string does not exist, returns + * null. + * + * The correctness of this method depends upon the automaton being deterministic, + * and having no transitions to dead states. 
+ * + * @param s input String + * @return next valid String + */ + private String nextString(String s) { + State state; + int pos = 0; + + while (true) { + state = automaton.getInitialState(); + // walk the automaton until a character is rejected. + for (pos = 0; pos < s.length(); pos++) { + State nextState = step(state, s.charAt(pos)); + if (nextState == null) + break; + else + state = nextState; + } + + // take the useful portion, and the last non-reject state, and attempt to + // append characters that will match. + String nextString = nextString(s, state, pos); + if (nextString != null) { + return nextString; + } else { /* no more solutions exist from this useful portion, backtrack */ + if (pos == 0) /* all solutions exhausted */ + return null; + char nextChar = s.charAt(pos - 1); + nextChar++; + String sprime = s.substring(0, pos - 1) + nextChar; + // if this is accepted it is good to go as-is. + if (runAutomaton.run(sprime)) + return sprime; + else + s = sprime; + } + } + } + + /** + * Returns the next String in lexicographic order after s that will not put + * the machine into a reject state. Appends some characters to the useful + * portion. If this cannot satisfy the machine, returns null. This method will + * walk the minimal path, in lexicographic order, as long as possible. + * + * @param s input String + * @param state current non-reject state + * @param useful most useful portion of the string + * @return next valid String + */ + private String nextString(String s, State state, int useful) { + /* + * the next lexicographic character must be greater than the existing + * character, if it exists. + */ + char c = 0; + if (useful < s.length()) { + c = s.charAt(useful); + c++; // cannot overflow as U+FFFF cannot be in the index. 
+ } + + StringBuilder sb = new StringBuilder(); + // append the useful portion + sb.append(s, 0, useful); + + Set visited = new HashSet(); + visited.add(state); + + Transition transitions[] = getTransitions(state); + + // find the minimal path (lexicographic order) that is >= c + + for (int i = 0; i < transitions.length; i++) { + Transition transition = transitions[i]; + if (transition.getMax() >= c) { + char nextChar = (char) Math.max(c, transition.getMin()); + sb.append(nextChar); + state = transition.getDest(); + /* + * as long as is possible, continue down the minimal path in + * lexicographic order. if a loop or accept state is encountered, stop. + */ + while (!visited.contains(state) && !state.isAccept()) { + visited.add(state); + /* + * Note: we work with a DFA with no transitions to dead states. + * so the below is ok, if it is not an accept state, + * then there MUST be at least one transition. + */ + transition = getTransitions(state)[0]; + sb.append(transition.getMin()); + state = transition.getDest(); + } + return sb.toString(); + } + } + return null; + } + + /** + * Get the cached set of transitions for a state. + */ + private Transition[] getTransitions(State state) { + return transitionCache.get(state); + } + + /** + * Step the state machine forward one character, + * using cached transitions. 
+ */ + private State step(State state, char c) { + Transition transitions[] = getTransitions(state); + for (int i = 0; i < transitions.length; i++) + if (transitions[i].getMin() <= c && c <= transitions[i].getMax()) + return transitions[i].getDest(); + return null; + } +} Property changes on: src/java/org/apache/lucene/search/AutomatonTermEnum.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/WildcardTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/WildcardTermEnum.java (revision 883088) +++ src/java/org/apache/lucene/search/WildcardTermEnum.java (working copy) @@ -30,13 +30,7 @@ * the enumeration is greater than all that precede it. * @deprecated Please use {@link WildcardTermsEnum} instead. */ -public class WildcardTermEnum extends FilteredTermEnum { - final Term searchTerm; - final String field; - final String text; - final String pre; - final int preLen; - boolean endEnum = false; +public class WildcardTermEnum extends AutomatonTermEnum { /** * Creates a new WildcardTermEnum. @@ -45,60 +39,21 @@ * valid term if such a term exists. 
*/ public WildcardTermEnum(IndexReader reader, Term term) throws IOException { - super(); - searchTerm = term; - field = searchTerm.field(); - final String searchTermText = searchTerm.text(); - - final int sidx = searchTermText.indexOf(WILDCARD_STRING); - final int cidx = searchTermText.indexOf(WILDCARD_CHAR); - int idx = sidx; - if (idx == -1) { - idx = cidx; - } - else if (cidx >= 0) { - idx = Math.min(idx, cidx); - } - pre = idx != -1?searchTerm.text().substring(0,idx): ""; - - preLen = pre.length(); - text = searchTermText.substring(preLen); - setEnum(reader.terms(new Term(searchTerm.field(), pre))); + super(WildcardQuery.toAutomaton(term), term, reader); } - @Override - protected final boolean termCompare(Term term) { - if (field == term.field()) { - String searchText = term.text(); - if (searchText.startsWith(pre)) { - return wildcardEquals(text, 0, searchText, preLen); - } - } - endEnum = true; - return false; - } + /** String equality with support for wildcards */ + public static final char WILDCARD_STRING = WildcardQuery.WILDCARD_STRING; - @Override - public float difference() { - return 1.0f; - } + /** Char equality with support for wildcards */ + public static final char WILDCARD_CHAR = WildcardQuery.WILDCARD_CHAR; - @Override - public final boolean endEnum() { - return endEnum; - } - - /******************************************** - * String equality with support for wildcards - ********************************************/ - - public static final char WILDCARD_STRING = '*'; - public static final char WILDCARD_CHAR = '?'; - /** * Determines if a word matches a wildcard pattern. * Work released by Granta Design Ltd after originally being done on * company time. + *

Note: This method is no longer used by this class! + * It is dead code and only available for backwards compatibility. */ public static final boolean wildcardEquals(String pattern, int patternIdx, String string, int stringIdx) Index: src/java/org/apache/lucene/search/AutomatonTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 0) +++ src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 0) @@ -0,0 +1,363 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermRef; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; + +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.SpecialOperations; + +/** + * A FilteredTermEnum that enumerates terms based upon what is accepted by a + * DFA. + *

+ * The algorithm is as follows: + *

    + *
  1. As long as matches are successful, keep reading sequentially. + *
  2. When a match fails, skip to the next string in lexicographic order that + * does not enter a reject state. + *
+ *

+ * The algorithm does not attempt to actually skip to the next string that is + * completely accepted. This is not possible when the language accepted by the + * DFA is not finite (e.g. when it uses the * operator). + *

+ *

+ * If the DFA has a leading Kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case it's much + * better to do just that than to use smart enumeration. + * This heuristic looks for an initial loop, with a range of at least 1/3 + * of the Unicode BMP. + * Use {@link #usesLinearMode} to find out whether it enumerates all terms + * in linear mode without seeking. + *

+ */ +public class AutomatonTermsEnum extends FilteredTermsEnum { + private final RunAutomaton runAutomaton; + private final Automaton automaton; + private final boolean linearMode; + private final TermRef commonPrefixRef; + + // for complex machines that must make a lot of comparisons + private final Map transitionCache; + private final TermRef seekTermRef = new TermRef(); + private final String field; + + /** + * Construct an enumerator based upon an automaton, enumerating the specified + * field, working on a supplied reader. + */ + AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader) + throws IOException { + super(); + this.automaton = automaton; + field = queryTerm.field(); + + /* + * If the DFA has a leading kleene star, or something similar, it will + * need to run against the entire term dictionary. In this case its much + * better to do just that than to use smart enumeration. + * + * this heuristic looks for an initial loop, with a range of at least 1/3 + * of the unicode BMP. + */ + org.apache.lucene.util.automaton.State initialState = automaton.getInitialState(); + boolean linearMode = false; + for (Transition transition : initialState.getTransitions()) { + if (transition.getDest() == initialState && (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) { + linearMode = true; + break; + } + } + this.linearMode = linearMode; + + /* + * tableize the automaton. this also ensures it is deterministic, and has no + * transitions to dead states. 
+ */ + runAutomaton = new RunAutomaton(this.automaton); + + if (this.linearMode) { + // iterate all terms in linear mode + commonPrefixRef = new TermRef(SpecialOperations.getCommonPrefix(automaton)); + transitionCache = null; + seekTermRef.copy(commonPrefixRef); + + } else { + commonPrefixRef = new TermRef(); + + // build a cache of sorted transitions for every state + transitionCache = new HashMap(runAutomaton.getSize()); + for (org.apache.lucene.util.automaton.State state : this.automaton.getStates()) { + List transitions = state.getSortedTransitions(false); + transitionCache.put(state, transitions.toArray(new Transition[transitions.size()])); + } + + String startPoint = nextString(""); + + /* + * in this case this automaton will not accept any strings. + * start the enumeration at the empty string, next() will return false. + */ + if (startPoint == null) { + startPoint = ""; + } + seekTermRef.copy(startPoint); + } + + Terms fieldTerms = reader.fields().terms(field); + if (fieldTerms != null) { + empty = setEnum(fieldTerms.iterator(), seekTermRef) == null; + } else { + empty = true; + } + } + + public String field() { + return field; + } + + @Override + public final boolean empty() { + return empty; + } + + private boolean empty; + + /** + * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode. + */ + public final boolean usesLinearMode() { + return linearMode; + } + + @Override + public float difference() { + return 1.0f; + } + + /** + * Returns true if the term matches the automaton. Also stashes away the term + * to assist with smart enumeration. + *

In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted. + * In smart mode, it will never do this. + */ + @Override + protected AcceptStatus accept(final TermRef term) { + if (!linearMode || term.startsWith(commonPrefixRef)) { + return runAutomaton.run(term.toString()) ? AcceptStatus.YES : AcceptStatus.NO; + } else { + return AcceptStatus.END; + } + } + + /** + * In smart mode, increments to the next term matching this automaton. + * After a successful comparison, it simply tries the next term. + * After an unsuccessful comparison, it seeks to a smarter position. + *

If the enum is in linear mode, it simply calls {@code super.next()} to + * just filter the current enum until {@link #endEnum} returns {@code true}. + */ + @Override + public TermRef next() throws IOException { + if (linearMode) + return super.next(); + + boolean first = true; + + do { + /* + * if the previous enumeration was a match, don't even bother + * trying to compute the next place to seek to. + * this is an optimization for a DFA that matches many sequential terms, + * such as ab* + */ + if (first) { + first = false; + if (actualEnum.next() == null) { + return null; + } + } else { + // seek to the next possible string + String nextPoint = nextString(actualEnum.term().toString()); + if (nextPoint == null) { // no more possible strings can match + return null; + } + // reposition + seekTermRef.copy(nextPoint); + if (actualEnum.seek(seekTermRef) == TermsEnum.SeekStatus.END) { + return null; + } + } + + TermRef candidateTerm = actualEnum.term(); + + /* + * this means end of enumeration: no more terms for this field or no more + * terms at all + */ + if (candidateTerm == null) { + return null; + } + + // if the term matches the automaton, success! + if (accept(candidateTerm) == AcceptStatus.YES) { + return candidateTerm; + } + } while (true); + } + + /** + * Returns the next String in lexicographic order after s that will not put + * the machine into a reject state. If such a string does not exist, returns + * null. + * + * The correctness of this method depends upon the automaton being deterministic, + * and having no transitions to dead states. + * + * @param s input String + * @return next valid String + */ + private String nextString(String s) { + org.apache.lucene.util.automaton.State state; + int pos = 0; + + while (true) { + state = automaton.getInitialState(); + // walk the automaton until a character is rejected. 
+ for (pos = 0; pos < s.length(); pos++) { + org.apache.lucene.util.automaton.State nextState = step(state, s.charAt(pos)); + if (nextState == null) + break; + else + state = nextState; + } + + // take the useful portion, and the last non-reject state, and attempt to + // append characters that will match. + String nextString = nextString(s, state, pos); + if (nextString != null) { + return nextString; + } else { /* no more solutions exist from this useful portion, backtrack */ + if (pos == 0) /* all solutions exhausted */ + return null; + char nextChar = s.charAt(pos - 1); + nextChar++; + String sprime = s.substring(0, pos - 1) + nextChar; + // if this is accepted it is good to go as-is. + if (runAutomaton.run(sprime)) + return sprime; + else + s = sprime; + } + } + } + + /** + * Returns the next String in lexicographic order after s that will not put + * the machine into a reject state. Appends some characters to the useful + * portion. If this cannot satisfy the machine, returns null. This method will + * walk the minimal path, in lexicographic order, as long as possible. + * + * @param s input String + * @param state current non-reject state + * @param useful most useful portion of the string + * @return next valid String + */ + private String nextString(String s, org.apache.lucene.util.automaton.State state, int useful) { + /* + * the next lexicographic character must be greater than the existing + * character, if it exists. + */ + char c = 0; + if (useful < s.length()) { + c = s.charAt(useful); + c++; // cannot overflow as U+FFFF cannot be in the index. 
+ } + + StringBuilder sb = new StringBuilder(); + // append the useful portion + sb.append(s, 0, useful); + + Set visited = new HashSet(); + visited.add(state); + + Transition transitions[] = getTransitions(state); + + // find the minimal path (lexicographic order) that is >= c + + for (int i = 0; i < transitions.length; i++) { + Transition transition = transitions[i]; + if (transition.getMax() >= c) { + char nextChar = (char) Math.max(c, transition.getMin()); + sb.append(nextChar); + state = transition.getDest(); + /* + * as long as is possible, continue down the minimal path in + * lexicographic order. if a loop or accept state is encountered, stop. + */ + while (!visited.contains(state) && !state.isAccept()) { + visited.add(state); + /* + * Note: we work with a DFA with no transitions to dead states. + * so the below is ok, if it is not an accept state, + * then there MUST be at least one transition. + */ + transition = getTransitions(state)[0]; + sb.append(transition.getMin()); + state = transition.getDest(); + } + return sb.toString(); + } + } + return null; + } + + /** + * Get the cached set of transitions for a state. + */ + private Transition[] getTransitions(org.apache.lucene.util.automaton.State state) { + return transitionCache.get(state); + } + + /** + * Step the state machine forward one character, + * using cached transitions. 
+ */ + private org.apache.lucene.util.automaton.State step(org.apache.lucene.util.automaton.State state, char c) { + Transition transitions[] = getTransitions(state); + for (int i = 0; i < transitions.length; i++) + if (transitions[i].getMin() <= c && c <= transitions[i].getMax()) + return transitions[i].getDest(); + return null; + } +} Property changes on: src/java/org/apache/lucene/search/AutomatonTermsEnum.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/WildcardQuery.java =================================================================== --- src/java/org/apache/lucene/search/WildcardQuery.java (revision 883088) +++ src/java/org/apache/lucene/search/WildcardQuery.java (working copy) @@ -17,75 +17,67 @@ * limitations under the License. */ -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; import java.io.IOException; +import java.util.List; +import java.util.ArrayList; /** Implements the wildcard search query. Supported wildcards are *, which * matches any character sequence (including the empty one), and ?, * which matches any single character. Note this query can be slow, as it * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries, - * a Wildcard term should not start with one of the wildcards * or - * ?. + * a Wildcard term should not start with the wildcard *. * *

This query uses the {@link * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * rewrite method. * * @see WildcardTermEnums */ -public class WildcardQuery extends MultiTermQuery { - private boolean termContainsWildcard; - private boolean termIsPrefix; - protected Term term; - - public WildcardQuery(Term term) { - this.term = term; - String text = term.text(); - this.termContainsWildcard = (text.indexOf('*') != -1) - || (text.indexOf('?') != -1); - this.termIsPrefix = termContainsWildcard - && (text.indexOf('?') == -1) - && (text.indexOf('*') == text.length() - 1); - } - - // nocommit: needs singletermenum stuff - @Override - protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException { - //nocommit: handle singletermenum - return new WildcardTermsEnum(reader, getTerm()); - } - - // @deprecated see getTermsEnum - @Override - protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { - if (termContainsWildcard) - return new WildcardTermEnum(reader, getTerm()); - else - return new SingleTermEnum(reader, getTerm()); - } - - /** - * Returns the pattern term. - */ - public Term getTerm() { - return term; - } + public class WildcardQuery extends AutomatonQuery { + + /** String equality with support for wildcards */ + public static final char WILDCARD_STRING = '*'; + + /** Char equality with support for wildcards */ + public static final char WILDCARD_CHAR = '?'; + + /** + * Constructs a query for terms matching term. + */ + public WildcardQuery(Term term) { + super(term, toAutomaton(term)); + } - @Override - public Query rewrite(IndexReader reader) throws IOException { - if (termIsPrefix) { - MultiTermQuery rewritten = new PrefixQuery(term.createTerm(term.text() - .substring(0, term.text().indexOf('*')))); - rewritten.setBoost(getBoost()); - rewritten.setRewriteMethod(getRewriteMethod()); - return rewritten; - } else { - return super.rewrite(reader); - } - } - + /** + * Convert Lucene wildcard syntax into an automaton. 
+ */ + static Automaton toAutomaton(Term wildcardquery) { + List automata = new ArrayList(); + + String wildcardText = wildcardquery.text(); + + for (int i = 0; i < wildcardText.length(); i++) { + final char c = wildcardText.charAt(i); + switch(c) { + case WILDCARD_STRING: + automata.add(BasicAutomata.makeAnyString()); + break; + case WILDCARD_CHAR: + automata.add(BasicAutomata.makeAnyChar()); + break; + default: + automata.add(BasicAutomata.makeChar(c)); + } + } + + return BasicOperations.concatenate(automata); + } + /** Prints a user-readable version of this query. */ @Override public String toString(String field) { @@ -98,30 +90,4 @@ buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } - - @Override - public int hashCode() { - final int prime = 31; - int result = super.hashCode(); - result = prime * result + ((term == null) ? 0 : term.hashCode()); - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (!super.equals(obj)) - return false; - if (getClass() != obj.getClass()) - return false; - WildcardQuery other = (WildcardQuery) obj; - if (term == null) { - if (other.term != null) - return false; - } else if (!term.equals(other.term)) - return false; - return true; - } - } Index: src/java/org/apache/lucene/search/AutomatonQuery.java =================================================================== --- src/java/org/apache/lucene/search/AutomatonQuery.java (revision 0) +++ src/java/org/apache/lucene/search/AutomatonQuery.java (revision 0) @@ -0,0 +1,167 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.SingleTermEnum; +import org.apache.lucene.util.ToStringUtils; + +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.MinimizationOperations; +import org.apache.lucene.util.automaton.SpecialOperations; + +/** + * A {@link Query} that will match terms against a finite-state machine. + *

+ * This query will match documents that contain terms accepted by a given + * finite-state machine. The automaton can be constructed with the + * {@link org.apache.lucene.util.automaton} API. Alternatively, it can be + * created from a regular expression with {@link RegexpQuery} or from + * the standard Lucene wildcard syntax with {@link WildcardQuery}. + *

+ *

+ * When the query is executed, it will create an equivalent minimal DFA of the + * finite-state machine, and will enumerate the term dictionary in an + * intelligent way to reduce the number of comparisons. For example: the regular + * expression of [dl]og? will make approximately four comparisons: + * do, dog, lo, and log. + *

+ */
+public class AutomatonQuery extends MultiTermQuery {
+  /** the automaton to match index terms against */
+  protected Automaton automaton;
+  /** term containing the field, and possibly some pattern structure */
+  protected Term term;
+
+  /**
+   * Create a new AutomatonQuery from an {@link Automaton}.
+   * 
+   * @param term Term containing field and possibly some pattern structure. The
+   *        term text is ignored.
+   * @param automaton Automaton to run, terms that are accepted are considered a
+   *        match. NOTE(review): it is handed to minimize() as-is, not copied.
+   */
+  public AutomatonQuery(Term term, Automaton automaton) {
+    super();
+    this.term = term;
+    this.automaton = automaton;
+    MinimizationOperations.minimize(automaton); // result unused: presumably minimizes the caller's instance in place -- confirm
+  }
+
+  @Override
+  protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException {
+    // TODO: single-term shortcut (fixed string in expanded or singleton representation) is disabled below
+
+    // nocommit -- handle
+    /*
+    String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
+    if (automaton.equals(BasicAutomata.makeString(commonPrefix))) {
+      return new SingleTermEnum(reader, term.createTerm(commonPrefix));
+    }
+    */
+
+    return new AutomatonTermsEnum(automaton, term, reader);
+  }
+
+  @Override
+  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
+    // matches a fixed string in expanded or singleton representation
+    String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
+    if (automaton.equals(BasicAutomata.makeString(commonPrefix)))
+      return new SingleTermEnum(reader, term.createTerm(commonPrefix)); // automaton accepts exactly one string: enumerate just that term
+
+    return new AutomatonTermEnum(automaton, term, reader);
+  }
+
+  @Override
+  public Query rewrite(IndexReader reader) throws IOException {
+    /*
+     * It is a prefix query, if it accepts the same language as its common
+     * prefix appended with any possible string.
+     */
+
+    String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
+    Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata
+        .makeString(commonPrefix), BasicAutomata.makeAnyString());
+
+    if (automaton.equals(prefixAutomaton)) {
+      PrefixQuery rewritten = new PrefixQuery(term.createTerm(commonPrefix));
+      rewritten.setBoost(getBoost()); // carry this query's boost over to the replacement
+      rewritten.setRewriteMethod(getRewriteMethod()); // and its rewrite method
+      return rewritten;
+    }
+
+    return super.rewrite(reader);
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = super.hashCode();
+    result = prime * result + ((automaton == null) ? 0 : automaton.hashCode());
+    result = prime * result + ((term == null) ? 0 : term.hashCode());
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (!super.equals(obj))
+      return false;
+    if (getClass() != obj.getClass()) // exact class match: a subclass instance is never equal to a plain AutomatonQuery
+      return false;
+    AutomatonQuery other = (AutomatonQuery) obj;
+    if (automaton == null) {
+      if (other.automaton != null)
+        return false;
+    } else if (!automaton.equals(other.automaton))
+      return false;
+    if (term == null) {
+      if (other.term != null)
+        return false;
+    } else if (!term.equals(other.term))
+      return false;
+    return true;
+  }
+
+  @Override
+  public String toString(String field) {
+    StringBuilder buffer = new StringBuilder();
+    if (!term.field().equals(field)) { // field prefix is printed only when it differs from the requested default field
+      buffer.append(term.field());
+      buffer.append(":");
+    }
+    buffer.append(getClass().getSimpleName());
+    buffer.append(" {");
+    buffer.append('\n');
+    buffer.append(automaton.toString());
+    buffer.append("}");
+    buffer.append(ToStringUtils.boost(getBoost()));
+    return buffer.toString();
+  }
+}

Property changes on: src/java/org/apache/lucene/search/AutomatonQuery.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/util/automaton/State.java
===================================================================
--- 
src/java/org/apache/lucene/util/automaton/State.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/State.java (revision 0) @@ -0,0 +1,202 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Automaton state. + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class State implements Serializable, Comparable<State> {
+
+  static final long serialVersionUID = 30001;
+
+  boolean accept;
+  Set<Transition> transitions;
+
+  int number;
+
+  int id;
+  static int next_id; // unsynchronized counter: id assignment in the constructor is not thread-safe
+
+  /**
+   * Constructs a new state. Initially, the new state is a reject state.
+   */
+  public State() {
+    resetTransitions();
+    id = next_id++;
+  }
+
+  /**
+   * Resets transition set.
+   */
+  final void resetTransitions() {
+    transitions = new HashSet<Transition>();
+  }
+
+  /**
+   * Returns the set of outgoing transitions. Subsequent changes are reflected
+   * in the automaton.
+   * 
+   * @return transition set
+   */
+  public Set<Transition> getTransitions() {
+    return transitions;
+  }
+
+  /**
+   * Adds an outgoing transition.
+   * 
+   * @param t transition
+   */
+  public void addTransition(Transition t) {
+    transitions.add(t);
+  }
+
+  /**
+   * Sets acceptance for this state.
+   * 
+   * @param accept if true, this state is an accept state
+   */
+  public void setAccept(boolean accept) {
+    this.accept = accept;
+  }
+
+  /**
+   * Returns acceptance status.
+   * 
+   * @return true if this is an accept state
+   */
+  public boolean isAccept() {
+    return accept;
+  }
+
+  /**
+   * Performs lookup in transitions, assuming determinism.
+   * 
+   * @param c character to look up
+   * @return destination state, null if no matching outgoing transition
+   * @see #step(char, Collection)
+   */
+  public State step(char c) {
+    for (Transition t : transitions)
+      if (t.min <= c && c <= t.max) return t.to;
+    return null;
+  }
+
+  /**
+   * Performs lookup in transitions, allowing nondeterminism.
+   * 
+   * @param c character to look up
+   * @param dest collection where destination states are stored
+   * @see #step(char)
+   */
+  public void step(char c, Collection<State> dest) {
+    for (Transition t : transitions)
+      if (t.min <= c && c <= t.max) dest.add(t.to);
+  }
+
+  void addEpsilon(State to) { // folds 'to' into this state: inherits its acceptance and copies its outgoing transitions
+    if (to.accept) accept = true;
+    for (Transition t : to.transitions)
+      transitions.add(t);
+  }
+
+  /**
+   * Returns transitions sorted by (min, reverse max, to) or (to, min, reverse
+   * max)
+   */
+  Transition[] getSortedTransitionArray(boolean to_first) {
+    Transition[] e = transitions.toArray(new Transition[transitions.size()]);
+    Arrays.sort(e, new TransitionComparator(to_first));
+    return e;
+  }
+
+  /**
+   * Returns sorted list of outgoing transitions.
+   * 
+   * @param to_first if true, order by (to, min, reverse max); otherwise (min,
+   *          reverse max, to)
+   * @return transition list
+   */
+  public List<Transition> getSortedTransitions(boolean to_first) {
+    return Arrays.asList(getSortedTransitionArray(to_first));
+  }
+
+  /**
+   * Returns string describing this state. Normally invoked via
+   * {@link Automaton#toString()}.
+   */
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    b.append("state ").append(number);
+    if (accept) b.append(" [accept]");
+    else b.append(" [reject]");
+    b.append(":\n");
+    for (Transition t : transitions)
+      b.append(" ").append(t.toString()).append("\n");
+    return b.toString();
+  }
+
+  /**
+   * Compares this object with the specified object for order. States are
+   * ordered by construction id, most recently constructed first.
+   */
+  public int compareTo(State s) {
+    return (s.id < id) ? -1 : ((s.id == id) ? 0 : 1); // same sign as s.id - id, without int-subtraction overflow
+  }
+
+  /**
+   * See {@link java.lang.Object#equals(java.lang.Object)}.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    return super.equals(obj);
+  }
+
+  /**
+   * See {@link java.lang.Object#hashCode()}.
+   */
+  @Override
+  public int hashCode() {
+    return super.hashCode();
+  }
+}

Property changes on: src/java/org/apache/lucene/util/automaton/State.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/util/automaton/TransitionComparator.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0)
@@ -0,0 +1,80 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; +import java.util.Comparator; + +/** + * Comparator for state {@link Transition}s that orders unicode char range + * transitions in lexicographic order. + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+class TransitionComparator implements Comparator<Transition>, Serializable {
+
+  static final long serialVersionUID = 10001;
+
+  boolean to_first; // true: destination state is the primary sort key; false: it is the final tie-breaker
+
+  TransitionComparator(boolean to_first) {
+    this.to_first = to_first;
+  }
+
+  /**
+   * Compares by (min, reverse max, to) or (to, min, reverse max).
+   */
+  public int compare(Transition t1, Transition t2) {
+    if (to_first) {
+      if (t1.to != t2.to) {
+        if (t1.to == null) return -1; // a null destination sorts before any non-null one
+        else if (t2.to == null) return 1;
+        else if (t1.to.number < t2.to.number) return -1;
+        else if (t1.to.number > t2.to.number) return 1;
+      }
+    }
+    if (t1.min < t2.min) return -1;
+    if (t1.min > t2.min) return 1;
+    if (t1.max > t2.max) return -1; // reverse max: the wider interval comes first
+    if (t1.max < t2.max) return 1;
+    if (!to_first) {
+      if (t1.to != t2.to) {
+        if (t1.to == null) return -1;
+        else if (t2.to == null) return 1;
+        else if (t1.to.number < t2.to.number) return -1;
+        else if (t1.to.number > t2.to.number) return 1;
+      }
+    }
+    return 0;
+  }
+}

Property changes on: src/java/org/apache/lucene/util/automaton/TransitionComparator.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0)
@@ -0,0 +1,53 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.IOException; + +/** + * Automaton provider for RegExp. + * {@link RegExp#toAutomaton(AutomatonProvider)} + * + *

+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public interface AutomatonProvider { + + /** + * Returns automaton of the given name. + * + * @param name automaton name + * @return automaton + * @throws IOException if errors occur + */ + public Automaton getAutomaton(String name) throws IOException; +} Property changes on: src/java/org/apache/lucene/util/automaton/AutomatonProvider.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/SpecialOperations.java =================================================================== --- src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0) +++ src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0) @@ -0,0 +1,118 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.HashSet; + +/** + * Special automata operations. + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class SpecialOperations {
+
+  private SpecialOperations() {}
+
+  /**
+   * Finds the index of the largest entry whose value is less than or equal to
+   * <code>c</code>, or 0 if there is no such entry.
+   */
+  static int findIndex(char c, char[] points) {
+    int a = 0;
+    int b = points.length;
+    while (b - a > 1) { // binary search over the half-open index range [a, b)
+      int d = (a + b) >>> 1;
+      if (points[d] > c) b = d;
+      else if (points[d] < c) a = d;
+      else return d;
+    }
+    return a;
+  }
+
+  /**
+   * Returns true if the language of this automaton is finite.
+   */
+  public static boolean isFinite(Automaton a) {
+    if (a.isSingleton()) return true;
+    return isFinite(a.initial, new HashSet<State>());
+  }
+
+  /**
+   * Checks whether there is a loop containing s. (This is sufficient since
+   * there are never transitions to dead states.)
+   */
+  private static boolean isFinite(State s, HashSet<State> path) {
+    path.add(s);
+    for (Transition t : s.transitions)
+      if (path.contains(t.to) || !isFinite(t.to, path)) return false; // t.to already on the current path means a cycle
+    path.remove(s); // backtrack: path holds only the states on the current DFS branch
+    return true;
+  }
+
+  /**
+   * Returns the longest string that is a prefix of all accepted strings and
+   * visits each state at most once.
+   * 
+   * @return common prefix
+   */
+  public static String getCommonPrefix(Automaton a) {
+    if (a.isSingleton()) return a.singleton;
+    StringBuilder b = new StringBuilder();
+    HashSet<State> visited = new HashSet<State>();
+    State s = a.initial;
+    boolean done;
+    do {
+      done = true;
+      visited.add(s);
+      if (!s.accept && s.transitions.size() == 1) { // keep going only while the next character is forced
+        Transition t = s.transitions.iterator().next();
+        if (t.min == t.max && !visited.contains(t.to)) {
+          b.append(t.min);
+          s = t.to;
+          done = false;
+        }
+      }
+    } while (!done);
+    return b.toString();
+  }
+
+  /**
+   * Prefix closes the given automaton.
+   */
+  public static void prefixClose(Automaton a) {
+    for (State s : a.getStates())
+      s.setAccept(true);
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+  }
+}

Property changes on: src/java/org/apache/lucene/util/automaton/SpecialOperations.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/util/automaton/BasicOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 0)
@@ -0,0 +1,624 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Basic automata operations. + * + *

+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +final public class BasicOperations { + + private BasicOperations() {} + + /** + * Returns an automaton that accepts the concatenation of the languages of the + * given automata. + *

+   * Complexity: linear in number of states.
+   */
+  static public Automaton concatenate(Automaton a1, Automaton a2) {
+    if (a1.isSingleton() && a2.isSingleton()) return BasicAutomata
+        .makeString(a1.singleton + a2.singleton); // two single strings: just concatenate the text
+    if (a1 == a2) { // same object on both sides: take two separate expanded clones
+      a1 = a1.cloneExpanded();
+      a2 = a2.cloneExpanded();
+    } else {
+      a1 = a1.cloneExpandedIfRequired();
+      a2 = a2.cloneExpandedIfRequired();
+    }
+    for (State s : a1.getAcceptStates()) {
+      s.accept = false;
+      s.addEpsilon(a2.initial); // copies a2.initial's transitions into s; s accepts again if a2.initial accepts
+    }
+    a1.deterministic = false;
+    a1.clearHashCode();
+    a1.checkMinimizeAlways();
+    return a1;
+  }
+
+  /**
+   * Returns an automaton that accepts the concatenation of the languages of the
+   * given automata.
+   * <p>
+   * Complexity: linear in total number of states.
+   */
+  static public Automaton concatenate(List<Automaton> l) {
+    if (l.isEmpty()) return BasicAutomata.makeEmptyString();
+    boolean all_singleton = true;
+    for (Automaton a : l)
+      if (!a.isSingleton()) {
+        all_singleton = false;
+        break;
+      }
+    if (all_singleton) { // only single strings: concatenate the text directly
+      StringBuilder b = new StringBuilder();
+      for (Automaton a : l)
+        b.append(a.singleton);
+      return BasicAutomata.makeString(b.toString());
+    } else {
+      for (Automaton a : l)
+        if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty(); // any empty operand empties the result
+      Set<Integer> ids = new HashSet<Integer>();
+      for (Automaton a : l)
+        ids.add(System.identityHashCode(a));
+      boolean has_aliases = ids.size() != l.size(); // fewer identity hashes than entries: treated as the same instance listed twice
+      Automaton b = l.get(0);
+      if (has_aliases) b = b.cloneExpanded();
+      else b = b.cloneExpandedIfRequired();
+      Set<State> ac = b.getAcceptStates();
+      boolean first = true;
+      for (Automaton a : l)
+        if (first) first = false; // element 0 is already the base automaton b
+        else {
+          if (a.isEmptyString()) continue;
+          Automaton aa = a;
+          if (has_aliases) aa = aa.cloneExpanded();
+          else aa = aa.cloneExpandedIfRequired();
+          Set<State> ns = aa.getAcceptStates();
+          for (State s : ac) {
+            s.accept = false;
+            s.addEpsilon(aa.initial);
+            if (s.accept) ns.add(s); // aa.initial accepts, so s stays an accept state for the next round
+          }
+          ac = ns;
+        }
+      b.deterministic = false;
+      b.clearHashCode();
+      b.checkMinimizeAlways();
+      return b;
+    }
+  }
+
+  /**
+   * Returns an automaton that accepts the union of the empty string and the
+   * language of the given automaton.
+   * <p>
+   * Complexity: linear in number of states.
+   */
+  static public Automaton optional(Automaton a) {
+    a = a.cloneExpandedIfRequired();
+    State s = new State();
+    s.addEpsilon(a.initial); // new initial state copies the old initial state's transitions
+    s.accept = true; // accepting initial state: the empty string is accepted
+    a.initial = s;
+    a.deterministic = false;
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+    return a;
+  }
+
+  /**
+   * Returns an automaton that accepts the Kleene star (zero or more
+   * concatenated repetitions) of the language of the given automaton. Never
+   * modifies the input automaton language.
+   * <p>
+   * Complexity: linear in number of states.
+   */
+  static public Automaton repeat(Automaton a) {
+    a = a.cloneExpanded();
+    State s = new State();
+    s.accept = true; // accepting initial state: zero repetitions are accepted
+    s.addEpsilon(a.initial);
+    for (State p : a.getAcceptStates())
+      p.addEpsilon(s); // every accept state gets s's transitions, so another repetition can start
+    a.initial = s;
+    a.deterministic = false;
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+    return a;
+  }
+
+  /**
+   * Returns an automaton that accepts <code>min</code> or more concatenated
+   * repetitions of the language of the given automaton.
+   * <p>
+   * Complexity: linear in number of states and in <code>min</code>.
+   */
+  static public Automaton repeat(Automaton a, int min) {
+    if (min == 0) return repeat(a);
+    List<Automaton> as = new ArrayList<Automaton>();
+    while (min-- > 0)
+      as.add(a); // the same instance min times; concatenate(List) clones when it sees repeated entries
+    as.add(repeat(a)); // followed by zero or more further repetitions
+    return concatenate(as);
+  }
+
+  /**
+   * Returns an automaton that accepts between <code>min</code> and
+   * <code>max</code> (including both) concatenated repetitions of the language
+   * of the given automaton.
+   * <p>
+   * Complexity: linear in number of states and in <code>min</code> and
+   * <code>max</code>.
+   */
+  static public Automaton repeat(Automaton a, int min, int max) {
+    if (min > max) return BasicAutomata.makeEmpty();
+    max -= min; // from here on, max counts the optional repetitions beyond min
+    a.expandSingleton(); // NOTE(review): called on the argument itself, not on a clone
+    Automaton b;
+    if (min == 0) b = BasicAutomata.makeEmptyString();
+    else if (min == 1) b = a.clone();
+    else {
+      List<Automaton> as = new ArrayList<Automaton>();
+      while (min-- > 0)
+        as.add(a);
+      b = concatenate(as);
+    }
+    if (max > 0) {
+      Automaton d = a.clone();
+      while (--max > 0) { // chain the optional copies: each new copy's accept states lead into the previous chain
+        Automaton c = a.clone();
+        for (State p : c.getAcceptStates())
+          p.addEpsilon(d.initial);
+        d = c;
+      }
+      for (State p : b.getAcceptStates())
+        p.addEpsilon(d.initial); // attach the optional chain after the mandatory part
+      b.deterministic = false;
+      b.clearHashCode();
+      b.checkMinimizeAlways();
+    }
+    return b;
+  }
+
+  /**
+   * Returns a (deterministic) automaton that accepts the complement of the
+   * language of the given automaton.
+   * <p>
+   * Complexity: linear in number of states (if already deterministic).
+   */
+  static public Automaton complement(Automaton a) {
+    Automaton result = a.cloneExpandedIfRequired();
+    result.determinize();
+    // Make the transition function total before flipping the accept flags.
+    result.totalize();
+    for (State state : result.getStates()) {
+      state.accept = !state.accept;
+    }
+    result.removeDeadTransitions();
+    return result;
+  }
+
+  /**
+   * Returns a (deterministic) automaton that accepts the intersection of the
+   * language of <code>a1</code> and the complement of the language of
+   * <code>a2</code>. As a side-effect, the automata may be determinized, if not
+   * already deterministic.
+   * <p>

+   * Complexity: quadratic in number of states (if already deterministic).
+   */
+  static public Automaton minus(Automaton a1, Automaton a2) {
+    // Nothing can remain when a1 is empty or is subtracted from itself.
+    if (BasicOperations.isEmpty(a1) || a1 == a2) {
+      return BasicAutomata.makeEmpty();
+    }
+    // Subtracting the empty language leaves a1 as it is.
+    if (BasicOperations.isEmpty(a2)) {
+      return a1.cloneIfRequired();
+    }
+    // A singleton a1 only needs a membership test against a2.
+    if (a1.isSingleton()) {
+      return a2.run(a1.singleton) ? BasicAutomata.makeEmpty() : a1
+          .cloneIfRequired();
+    }
+    return intersection(a1, a2.complement());
+  }
+
+  /**
+   * Returns an automaton that accepts the intersection of the languages of the
+   * given automata. Never modifies the input automata languages.
+   * <p>

+   * Complexity: quadratic in number of states.
+   */
+  static public Automaton intersection(Automaton a1, Automaton a2) {
+    // Singleton operands reduce to a membership test on the other operand.
+    if (a1.isSingleton()) {
+      if (a2.run(a1.singleton)) return a1.cloneIfRequired();
+      else return BasicAutomata.makeEmpty();
+    }
+    if (a2.isSingleton()) {
+      if (a1.run(a2.singleton)) return a2.cloneIfRequired();
+      else return BasicAutomata.makeEmpty();
+    }
+    if (a1 == a2) return a1.cloneIfRequired();
+    Transition[][] transitions1 = Automaton
+        .getSortedTransitions(a1.getStates());
+    Transition[][] transitions2 = Automaton
+        .getSortedTransitions(a2.getStates());
+    Automaton c = new Automaton();
+    // Product construction: each StatePair joins a state of a1 with a state
+    // of a2 and carries the corresponding state of the result in p.s.
+    LinkedList<StatePair> worklist = new LinkedList<StatePair>();
+    HashMap<StatePair,StatePair> newstates = new HashMap<StatePair,StatePair>();
+    StatePair p = new StatePair(c.initial, a1.initial, a2.initial);
+    worklist.add(p);
+    newstates.put(p, p);
+    while (worklist.size() > 0) {
+      p = worklist.removeFirst();
+      p.s.accept = p.s1.accept && p.s2.accept;
+      Transition[] t1 = transitions1[p.s1.number];
+      Transition[] t2 = transitions2[p.s2.number];
+      for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+        // Skip intervals of t2 that end before the current t1 interval.
+        while (b2 < t2.length && t2[b2].max < t1[n1].min)
+          b2++;
+        for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
+          if (t2[n2].max >= t1[n1].min) {
+            StatePair q = new StatePair(t1[n1].to, t2[n2].to);
+            StatePair r = newstates.get(q);
+            if (r == null) {
+              q.s = new State();
+              worklist.add(q);
+              newstates.put(q, q);
+              r = q;
+            }
+            // The new transition covers the overlap of the two intervals.
+            char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
+            char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
+            p.s.transitions.add(new Transition(min, max, r.s));
+          }
+      }
+    }
+    c.deterministic = a1.deterministic && a2.deterministic;
+    c.removeDeadTransitions();
+    c.checkMinimizeAlways();
+    return c;
+  }
+
+  /**
+   * Returns true if the language of <code>a1</code> is a subset of the language
+   * of <code>a2</code>. As a side-effect, <code>a2</code> is determinized if
+   * not already marked as deterministic.
+   * <p>

+   * Complexity: quadratic in number of states.
+   */
+  public static boolean subsetOf(Automaton a1, Automaton a2) {
+    if (a1 == a2) return true;
+    if (a1.isSingleton()) {
+      if (a2.isSingleton()) return a1.singleton.equals(a2.singleton);
+      return a2.run(a1.singleton);
+    }
+    a2.determinize();
+    Transition[][] transitions1 = Automaton
+        .getSortedTransitions(a1.getStates());
+    Transition[][] transitions2 = Automaton
+        .getSortedTransitions(a2.getStates());
+    // Explore pairs of states reachable in lock-step from the two initial
+    // states.
+    LinkedList<StatePair> worklist = new LinkedList<StatePair>();
+    HashSet<StatePair> visited = new HashSet<StatePair>();
+    StatePair p = new StatePair(a1.initial, a2.initial);
+    worklist.add(p);
+    visited.add(p);
+    while (worklist.size() > 0) {
+      p = worklist.removeFirst();
+      if (p.s1.accept && !p.s2.accept) return false;
+      Transition[] t1 = transitions1[p.s1.number];
+      Transition[] t2 = transitions2[p.s2.number];
+      for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+        while (b2 < t2.length && t2[b2].max < t1[n1].min)
+          b2++;
+        // [min1, max1] is the part of the t1 interval not yet covered by t2.
+        int min1 = t1[n1].min, max1 = t1[n1].max;
+        for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
+          if (t2[n2].min > min1) return false;
+          if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1;
+          else {
+            // t2 reaches the top of the char range: mark the remainder empty.
+            min1 = Character.MAX_VALUE;
+            max1 = Character.MIN_VALUE;
+          }
+          StatePair q = new StatePair(t1[n1].to, t2[n2].to);
+          if (!visited.contains(q)) {
+            worklist.add(q);
+            visited.add(q);
+          }
+        }
+        if (min1 <= max1) return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Returns an automaton that accepts the union of the languages of the given
+   * automata.
+   * <p>

+   * Complexity: linear in number of states.
+   */
+  public static Automaton union(Automaton a1, Automaton a2) {
+    // Same instance, or two singletons for the same string: the union is a1.
+    if ((a1.isSingleton() && a2.isSingleton() && a1.singleton
+        .equals(a2.singleton))
+        || a1 == a2) return a1.cloneIfRequired();
+    // a1 != a2 is guaranteed here, so the former "a1 == a2" branch that
+    // forced cloneExpanded() on both operands could never run and is gone.
+    a1 = a1.cloneExpandedIfRequired();
+    a2 = a2.cloneExpandedIfRequired();
+    State s = new State();
+    s.addEpsilon(a1.initial);
+    s.addEpsilon(a2.initial);
+    a1.initial = s;
+    a1.deterministic = false;
+    a1.clearHashCode();
+    a1.checkMinimizeAlways();
+    return a1;
+  }
+
+  /**
+   * Returns an automaton that accepts the union of the languages of the given
+   * automata.
+   * <p>

+   * Complexity: linear in number of states.
+   */
+  public static Automaton union(Collection<Automaton> l) {
+    // Fewer distinct identity hashes than elements is taken as a sign that
+    // the same instance occurs more than once; in that case every operand is
+    // cloned unconditionally.
+    Set<Integer> ids = new HashSet<Integer>();
+    for (Automaton a : l)
+      ids.add(System.identityHashCode(a));
+    boolean has_aliases = ids.size() != l.size();
+    State s = new State();
+    for (Automaton b : l) {
+      if (BasicOperations.isEmpty(b)) continue;
+      Automaton bb = b;
+      if (has_aliases) bb = bb.cloneExpanded();
+      else bb = bb.cloneExpandedIfRequired();
+      s.addEpsilon(bb.initial);
+    }
+    Automaton a = new Automaton();
+    a.initial = s;
+    a.deterministic = false;
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+    return a;
+  }
+
+  /**
+   * Determinizes the given automaton.
+   * <p>

+   * Complexity: exponential in number of states.
+   */
+  public static void determinize(Automaton a) {
+    if (a.deterministic || a.isSingleton()) return;
+    Set<State> initialset = new HashSet<State>();
+    initialset.add(a.initial);
+    determinize(a, initialset);
+  }
+
+  /**
+   * Determinizes the given automaton using the given set of initial states.
+   */
+  static void determinize(Automaton a, Set<State> initialset) {
+    char[] points = a.getStartPoints();
+    // subset construction
+    Map<Set<State>,Set<State>> sets = new HashMap<Set<State>,Set<State>>();
+    LinkedList<Set<State>> worklist = new LinkedList<Set<State>>();
+    Map<Set<State>,State> newstate = new HashMap<Set<State>,State>();
+    sets.put(initialset, initialset);
+    worklist.add(initialset);
+    a.initial = new State();
+    newstate.put(initialset, a.initial);
+    while (worklist.size() > 0) {
+      Set<State> s = worklist.removeFirst();
+      State r = newstate.get(s);
+      // A subset state accepts as soon as one member accepts.
+      for (State q : s)
+        if (q.accept) {
+          r.accept = true;
+          break;
+        }
+      for (int n = 0; n < points.length; n++) {
+        // p collects the targets reachable from s on the character points[n].
+        Set<State> p = new HashSet<State>();
+        for (State q : s)
+          for (Transition t : q.transitions)
+            if (t.min <= points[n] && points[n] <= t.max) p.add(t.to);
+        if (!sets.containsKey(p)) {
+          sets.put(p, p);
+          worklist.add(p);
+          newstate.put(p, new State());
+        }
+        State q = newstate.get(p);
+        // The interval runs up to just before the next start point.
+        char min = points[n];
+        char max;
+        if (n + 1 < points.length) max = (char) (points[n + 1] - 1);
+        else max = Character.MAX_VALUE;
+        r.transitions.add(new Transition(min, max, q));
+      }
+    }
+    a.deterministic = true;
+    a.removeDeadTransitions();
+  }
+
+  /**
+   * Adds epsilon transitions to the given automaton. This method adds extra
+   * character interval transitions that are equivalent to the given set of
+   * epsilon transitions.
+   *
+   * @param pairs collection of {@link StatePair} objects representing pairs of
+   *          source/destination states where epsilon transitions should be
+   *          added
+   */
+  public static void addEpsilons(Automaton a, Collection<StatePair> pairs) {
+    a.expandSingleton();
+    // forward: source state -> epsilon targets; back: target -> sources.
+    HashMap<State,HashSet<State>> forward = new HashMap<State,HashSet<State>>();
+    HashMap<State,HashSet<State>> back = new HashMap<State,HashSet<State>>();
+    for (StatePair p : pairs) {
+      HashSet<State> to = forward.get(p.s1);
+      if (to == null) {
+        to = new HashSet<State>();
+        forward.put(p.s1, to);
+      }
+      to.add(p.s2);
+      HashSet<State> from = back.get(p.s2);
+      if (from == null) {
+        from = new HashSet<State>();
+        back.put(p.s2, from);
+      }
+      from.add(p.s1);
+    }
+    // calculate epsilon closure
+    // NOTE: newly derived pairs are added to the caller's collection.
+    LinkedList<StatePair> worklist = new LinkedList<StatePair>(pairs);
+    HashSet<StatePair> workset = new HashSet<StatePair>(pairs);
+    while (!worklist.isEmpty()) {
+      StatePair p = worklist.removeFirst();
+      workset.remove(p);
+      HashSet<State> to = forward.get(p.s2);
+      HashSet<State> from = back.get(p.s1);
+      if (to != null) {
+        for (State s : to) {
+          StatePair pp = new StatePair(p.s1, s);
+          if (!pairs.contains(pp)) {
+            pairs.add(pp);
+            forward.get(p.s1).add(s);
+            back.get(s).add(p.s1);
+            worklist.add(pp);
+            workset.add(pp);
+            if (from != null) {
+              // Pairs ending in p.s1 must be revisited.
+              for (State q : from) {
+                StatePair qq = new StatePair(q, p.s1);
+                if (!workset.contains(qq)) {
+                  worklist.add(qq);
+                  workset.add(qq);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // add transitions
+    for (StatePair p : pairs)
+      p.s1.addEpsilon(p.s2);
+    a.deterministic = false;
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+  }
+
+  /**
+   * Returns true if the given automaton accepts the empty string and nothing
+   * else.
+   */
+  public static boolean isEmptyString(Automaton a) {
+    if (a.isSingleton()) return a.singleton.length() == 0;
+    else return a.initial.accept && a.initial.transitions.isEmpty();
+  }
+
+  /**
+   * Returns true if the given automaton accepts no strings.
+   */
+  public static boolean isEmpty(Automaton a) {
+    // A singleton accepts exactly one string, so it is never empty.
+    return !a.isSingleton() && !a.initial.accept
+        && a.initial.transitions.isEmpty();
+  }
+
+  /**
+   * Returns true if the given automaton accepts all strings.
+   */
+  public static boolean isTotal(Automaton a) {
+    if (a.isSingleton()) {
+      return false;
+    }
+    State start = a.initial;
+    if (!start.accept || start.transitions.size() != 1) {
+      return false;
+    }
+    // The only transition must be a self-loop over the whole char range.
+    Transition loop = start.transitions.iterator().next();
+    return loop.to == start && loop.min == Character.MIN_VALUE
+        && loop.max == Character.MAX_VALUE;
+  }
+
+  /**
+   * Returns true if the given string is accepted by the automaton.
+   * <p>

+ * Complexity: linear in the length of the string. + *

+   * Note: for full performance, use the {@link RunAutomaton} class.
+   */
+  public static boolean run(Automaton a, String s) {
+    if (a.isSingleton()) return s.equals(a.singleton);
+    if (a.deterministic) {
+      // Deterministic case: follow the single possible path.
+      State p = a.initial;
+      for (int i = 0; i < s.length(); i++) {
+        State q = p.step(s.charAt(i));
+        if (q == null) return false;
+        p = q;
+      }
+      return p.accept;
+    } else {
+      // Nondeterministic case: track the set of current states. pp holds the
+      // current states; pp_other/bb_other collect the next ones, with the bit
+      // set preventing duplicates.
+      Set<State> states = a.getStates();
+      Automaton.setStateNumbers(states);
+      LinkedList<State> pp = new LinkedList<State>();
+      LinkedList<State> pp_other = new LinkedList<State>();
+      BitSet bb = new BitSet(states.size());
+      BitSet bb_other = new BitSet(states.size());
+      pp.add(a.initial);
+      ArrayList<State> dest = new ArrayList<State>();
+      boolean accept = a.initial.accept;
+      for (int i = 0; i < s.length(); i++) {
+        char c = s.charAt(i);
+        accept = false;
+        pp_other.clear();
+        bb_other.clear();
+        for (State p : pp) {
+          dest.clear();
+          p.step(c, dest);
+          for (State q : dest) {
+            if (q.accept) accept = true;
+            if (!bb_other.get(q.number)) {
+              bb_other.set(q.number);
+              pp_other.add(q);
+            }
+          }
+        }
+        // Swap the current and next buffers for the following character.
+        LinkedList<State> tp = pp;
+        pp = pp_other;
+        pp_other = tp;
+        BitSet tb = bb;
+        bb = bb_other;
+        bb_other = tb;
+      }
+      return accept;
+    }
+  }
+}

Property changes on: src/java/org/apache/lucene/util/automaton/BasicOperations.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/util/automaton/Automaton.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0)
@@ -0,0 +1,819 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InvalidClassException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OptionalDataException; +import java.io.OutputStream; +import java.io.Serializable; +import java.net.URL; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +/** + * Finite-state automaton with regular expression operations. + *

+ * <p>
+ * Class invariants:
+ * <ul>
+ * <li>An automaton is either represented explicitly (with {@link State} and
+ * {@link Transition} objects) or with a singleton string (see
+ * {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton
+ * is known to accept exactly one string. (Implicitly, all states and
+ * transitions of an automaton are reachable from its initial state.)
+ * <li>Automata are always reduced (see {@link #reduce()}) and have no
+ * transitions to dead states (see {@link #removeDeadTransitions()}).
+ * <li>If an automaton is nondeterministic, then {@link #isDeterministic()}
+ * returns false (but the converse is not required).
+ * <li>Automata provided as input to operations are generally assumed to be
+ * disjoint.
+ * </ul>
+ * <p>
+ * If the states or transitions are manipulated manually, the
+ * {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods
+ * should be used afterwards to restore representation invariants that are
+ * assumed by the built-in automata operations.
+ *
+ * <p>

+ * WARNING: The status of the Automaton feature is
+ * experimental. The APIs introduced here might change in the future and will
+ * not be supported anymore in such a case.
+ */
+public class Automaton implements Serializable, Cloneable {
+
+  /** Serialization version of this class. */
+  static final long serialVersionUID = 10001;
+
+  /**
+   * Minimize using Hopcroft's O(n log n) algorithm. This is regarded as one of
+   * the most generally efficient algorithms that exist.
+   *
+   * @see #setMinimization(int)
+   */
+  public static final int MINIMIZE_HOPCROFT = 2;
+
+  /**
+   * Selects minimization algorithm (default: <code>MINIMIZE_HOPCROFT</code>).
+   * Static, so the choice is shared by all automata.
+   */
+  static int minimization = MINIMIZE_HOPCROFT;
+
+  /** Initial state of this automaton. */
+  State initial;
+
+  /**
+   * If true, then this automaton is definitely deterministic (i.e., there are
+   * no choices for any run, but a run may crash).
+   */
+  boolean deterministic;
+
+  /** Extra data associated with this automaton. Transient: not serialized. */
+  transient Object info;
+
+  /**
+   * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)}.
+   * The value 0 is treated as "not computed": {@link #clearHashCode()} resets
+   * it to 0 and {@link #hashCode()} minimizes whenever it finds 0.
+   */
+  int hash_code;
+
+  /** Singleton string. Null if not applicable. */
+  String singleton;
+
+  /** Minimize always flag. Static, so it is shared by all automata. */
+  static boolean minimize_always = false;
+
+  /**
+   * Selects whether operations may modify the input automata (default:
+   * <code>false</code>). Static, so it is shared by all automata.
+   */
+  static boolean allow_mutation = false;
+
+  /**
+   * Constructs a new automaton that accepts the empty language. Using this
+   * constructor, automata can be constructed manually from {@link State} and
+   * {@link Transition} objects.
+   *
+   * @see #setInitialState(State)
+   * @see State
+   * @see Transition
+   */
+  public Automaton() {
+    initial = new State();
+    deterministic = true;
+    singleton = null;
+  }
+
+  /**
+   * Returns true if the system property <code>dk.brics.automaton.debug</code>
+   * is set (to any value). The property is looked up on every call.
+   */
+  boolean isDebug() {
+    return System.getProperty("dk.brics.automaton.debug") != null;
+  }
+
+  /**
+   * Selects minimization algorithm (default: <code>MINIMIZE_HOPCROFT</code>).
+   *
+   * @param algorithm minimization algorithm
+   */
+  static public void setMinimization(int algorithm) {
+    Automaton.minimization = algorithm;
+  }
+
+  /**
+   * Sets or resets minimize always flag. If this flag is set, then
+   * {@link MinimizationOperations#minimize(Automaton)} will automatically be
+   * invoked after all operations that otherwise may produce non-minimal
+   * automata. By default, the flag is not set.
+   *
+   * @param flag if true, the flag is set
+   */
+  static public void setMinimizeAlways(boolean flag) {
+    Automaton.minimize_always = flag;
+  }
+
+  /**
+   * Sets or resets allow mutate flag. If this flag is set, then all automata
+   * operations may modify automata given as input; otherwise, operations will
+   * always leave input automata languages unmodified. By default, the flag is
+   * not set.
+   *
+   * @param flag if true, the flag is set
+   * @return previous value of the flag
+   */
+  static public boolean setAllowMutate(boolean flag) {
+    final boolean previous = Automaton.allow_mutation;
+    Automaton.allow_mutation = flag;
+    return previous;
+  }
+
+  /**
+   * Returns the state of the allow mutate flag. If this flag is set, then all
+   * automata operations may modify automata given as input; otherwise,
+   * operations will always leave input automata languages unmodified. By
+   * default, the flag is not set.
+   *
+   * @return current value of the flag
+   */
+  static boolean getAllowMutate() {
+    return Automaton.allow_mutation;
+  }
+
+  /** Minimizes this automaton if the minimize always flag is set. */
+  void checkMinimizeAlways() {
+    if (!minimize_always) {
+      return;
+    }
+    MinimizationOperations.minimize(this);
+  }
+
+  /** Returns true if this automaton is in singleton representation. */
+  boolean isSingleton() {
+    return null != singleton;
+  }
+
+  /**
+   * Returns the singleton string for this automaton. An automaton that accepts
+   * exactly one string <i>may</i> be represented in singleton mode. In that
+   * case, this method may be used to obtain the string.
+   *
+   * @return string, null if this automaton is not in singleton mode.
+   */
+  public String getSingleton() {
+    return this.singleton;
+  }
+
+  /**
+   * Sets initial state.
+   *
+   * @param s state
+   */
+  public void setInitialState(State s) {
+    initial = s;
+    // An explicit initial state ends singleton representation.
+    singleton = null;
+  }
+
+  /**
+   * Gets initial state.
+   *
+   * @return state
+   */
+  public State getInitialState() {
+    expandSingleton();
+    return initial;
+  }
+
+  /**
+   * Returns deterministic flag for this automaton.
+   *
+   * @return true if the automaton is definitely deterministic, false if the
+   *         automaton may be nondeterministic
+   */
+  public boolean isDeterministic() {
+    return deterministic;
+  }
+
+  /**
+   * Sets deterministic flag for this automaton. This method should (only) be
+   * used if automata are constructed manually.
+   *
+   * @param deterministic true if the automaton is definitely deterministic,
+   *          false if the automaton may be nondeterministic
+   */
+  public void setDeterministic(boolean deterministic) {
+    this.deterministic = deterministic;
+  }
+
+  /**
+   * Associates extra information with this automaton.
+   *
+   * @param info extra information
+   */
+  public void setInfo(Object info) {
+    this.info = info;
+  }
+
+  /**
+   * Returns extra information associated with this automaton.
+   *
+   * @return extra information
+   * @see #setInfo(Object)
+   */
+  public Object getInfo() {
+    return info;
+  }
+
+  /**
+   * Returns the set of states that are reachable from the initial state.
+   *
+   * @return set of {@link State} objects
+   */
+  public Set<State> getStates() {
+    expandSingleton();
+    // Read the debug switch once; it is a system-property lookup and was
+    // previously repeated for every visited state.
+    final boolean debug = isDebug();
+    // In debug mode keep insertion order and walk transitions sorted.
+    Set<State> visited;
+    if (debug) visited = new LinkedHashSet<State>();
+    else visited = new HashSet<State>();
+    LinkedList<State> worklist = new LinkedList<State>();
+    worklist.add(initial);
+    visited.add(initial);
+    while (worklist.size() > 0) {
+      State s = worklist.removeFirst();
+      Collection<Transition> tr;
+      if (debug) tr = s.getSortedTransitions(false);
+      else tr = s.transitions;
+      for (Transition t : tr)
+        if (!visited.contains(t.to)) {
+          visited.add(t.to);
+          worklist.add(t.to);
+        }
+    }
+    return visited;
+  }
+
+  /**
+   * Returns the set of reachable accept states.
+   *
+   * @return set of {@link State} objects
+   */
+  public Set<State> getAcceptStates() {
+    expandSingleton();
+    HashSet<State> accepts = new HashSet<State>();
+    HashSet<State> visited = new HashSet<State>();
+    LinkedList<State> worklist = new LinkedList<State>();
+    worklist.add(initial);
+    visited.add(initial);
+    while (worklist.size() > 0) {
+      State s = worklist.removeFirst();
+      if (s.accept) accepts.add(s);
+      for (Transition t : s.transitions)
+        if (!visited.contains(t.to)) {
+          visited.add(t.to);
+          worklist.add(t.to);
+        }
+    }
+    return accepts;
+  }
+
+  /**
+   * Assigns consecutive numbers to the given states.
+   */
+  static void setStateNumbers(Set<State> states) {
+    int number = 0;
+    for (State s : states)
+      s.number = number++;
+  }
+
+  /**
+   * Adds transitions to explicit crash state to ensure that transition function
+   * is total.
+   */
+  void totalize() {
+    // s is the crash state: it loops to itself on every character.
+    State s = new State();
+    s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+        s));
+    for (State p : getStates()) {
+      // maxi is the first character not yet covered by a transition of p; it
+      // is an int so it can go one past Character.MAX_VALUE.
+      int maxi = Character.MIN_VALUE;
+      for (Transition t : p.getSortedTransitions(false)) {
+        if (t.min > maxi) p.transitions.add(new Transition((char) maxi,
+            (char) (t.min - 1), s));
+        if (t.max + 1 > maxi) maxi = t.max + 1;
+      }
+      if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition(
+          (char) maxi, Character.MAX_VALUE, s));
+    }
+  }
+
+  /**
+   * Restores representation invariant. This method must be invoked before any
+   * built-in automata operation is performed if automaton states or transitions
+   * are manipulated manually.
+   *
+   * @see #setDeterministic(boolean)
+   */
+  public void restoreInvariant() {
+    removeDeadTransitions();
+  }
+
+  /**
+   * Reduces this automaton. An automaton is "reduced" by combining overlapping
+   * and adjacent edge intervals with same destination.
+   */
+  public void reduce() {
+    if (isSingleton()) return;
+    Set<State> states = getStates();
+    setStateNumbers(states);
+    for (State s : states) {
+      List<Transition> st = s.getSortedTransitions(true);
+      s.resetTransitions();
+      // p/min/max describe the interval being accumulated for destination p;
+      // it is written out when a gap or a different destination turns up.
+      State p = null;
+      int min = -1, max = -1;
+      for (Transition t : st) {
+        if (p == t.to) {
+          if (t.min <= max + 1) {
+            // Overlapping or adjacent: extend the accumulated interval.
+            if (t.max > max) max = t.max;
+          } else {
+            if (p != null) s.transitions.add(new Transition((char) min,
+                (char) max, p));
+            min = t.min;
+            max = t.max;
+          }
+        } else {
+          if (p != null) s.transitions.add(new Transition((char) min,
+              (char) max, p));
+          p = t.to;
+          min = t.min;
+          max = t.max;
+        }
+      }
+      if (p != null) s.transitions
+          .add(new Transition((char) min, (char) max, p));
+    }
+  }
+
+  /**
+   * Returns sorted array of all interval start points.
+   */
+  char[] getStartPoints() {
+    Set<Character> pointset = new HashSet<Character>();
+    for (State s : getStates()) {
+      pointset.add(Character.MIN_VALUE);
+      for (Transition t : s.transitions) {
+        pointset.add(t.min);
+        // The character after an interval starts the next one.
+        if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1));
+      }
+    }
+    char[] points = new char[pointset.size()];
+    int n = 0;
+    for (Character m : pointset)
+      points[n++] = m;
+    Arrays.sort(points);
+    return points;
+  }
+
+  /**
+   * Returns the set of live states. A state is "live" if an accept state is
+   * reachable from it.
+   *
+   * @return set of {@link State} objects
+   */
+  public Set<State> getLiveStates() {
+    expandSingleton();
+    return getLiveStates(getStates());
+  }
+
+  /**
+   * Computes the live states among the given (reachable) states by walking
+   * transitions backwards from the accept states.
+   */
+  private Set<State> getLiveStates(Set<State> states) {
+    // map: state -> states that have a transition into it.
+    HashMap<State,Set<State>> map = new HashMap<State,Set<State>>();
+    for (State s : states)
+      map.put(s, new HashSet<State>());
+    for (State s : states)
+      for (Transition t : s.transitions)
+        map.get(t.to).add(s);
+    Set<State> live = new HashSet<State>(getAcceptStates());
+    LinkedList<State> worklist = new LinkedList<State>(live);
+    while (worklist.size() > 0) {
+      State s = worklist.removeFirst();
+      for (State p : map.get(s))
+        if (!live.contains(p)) {
+          live.add(p);
+          worklist.add(p);
+        }
+    }
+    return live;
+  }
+
+  /**
+   * Removes transitions to dead states and calls {@link #reduce()} and
+   * {@link #clearHashCode()}. (A state is "dead" if no accept state is
+   * reachable from it.)
+   */
+  public void removeDeadTransitions() {
+    clearHashCode();
+    if (isSingleton()) return;
+    Set<State> states = getStates();
+    Set<State> live = getLiveStates(states);
+    for (State s : states) {
+      // Keep the old transition set, reset, and re-add only the live ones.
+      Set<Transition> st = s.transitions;
+      s.resetTransitions();
+      for (Transition t : st)
+        if (live.contains(t.to)) s.transitions.add(t);
+    }
+    reduce();
+  }
+
+  /**
+   * Returns a sorted array of transitions for each state (and sets state
+   * numbers).
+   */
+  static Transition[][] getSortedTransitions(Set<State> states) {
+    setStateNumbers(states);
+    Transition[][] transitions = new Transition[states.size()][];
+    for (State s : states)
+      transitions[s.number] = s.getSortedTransitionArray(false);
+    return transitions;
+  }
+
+  /**
+   * Expands singleton representation to normal representation. Does nothing if
+   * not in singleton representation.
+   */
+  public void expandSingleton() {
+    if (!isSingleton()) {
+      return;
+    }
+    // Build a chain of states, one transition per character of the string.
+    State tail = new State();
+    initial = tail;
+    for (char ch : singleton.toCharArray()) {
+      State next = new State();
+      tail.transitions.add(new Transition(ch, next));
+      tail = next;
+    }
+    tail.accept = true;
+    deterministic = true;
+    singleton = null;
+  }
+
+  /**
+   * Returns the number of states in this automaton.
+   */
+  public int getNumberOfStates() {
+    // A singleton of length n stands for a chain of n + 1 states.
+    return isSingleton() ? singleton.length() + 1 : getStates().size();
+  }
+
+  /**
+   * Returns the number of transitions in this automaton. This number is counted
+   * as the total number of edges, where one edge may be a character interval.
+   */
+  public int getNumberOfTransitions() {
+    if (isSingleton()) {
+      return singleton.length();
+    }
+    int count = 0;
+    for (State state : getStates()) {
+      count += state.transitions.size();
+    }
+    return count;
+  }
+
+  /**
+   * Returns true if the language of this automaton is equal to the language of
+   * the given automaton. Implemented using <code>hashCode</code> and
+   * <code>subsetOf</code>.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+    if (!(obj instanceof Automaton)) {
+      return false;
+    }
+    Automaton other = (Automaton) obj;
+    if (isSingleton() && other.isSingleton()) {
+      return singleton.equals(other.singleton);
+    }
+    // Compare hash codes first, then check inclusion in both directions.
+    if (hashCode() != other.hashCode()) {
+      return false;
+    }
+    return BasicOperations.subsetOf(this, other)
+        && BasicOperations.subsetOf(other, this);
+  }
+
+  /**
+   * Returns hash code for this automaton. The hash code is based on the number
+   * of states and transitions in the minimized automaton. Invoking this method
+   * may involve minimizing the automaton.
+   */
+  @Override
+  public int hashCode() {
+    // 0 means "not computed"; minimize() is relied on to fill hash_code in.
+    if (hash_code == 0) {
+      MinimizationOperations.minimize(this);
+    }
+    return hash_code;
+  }
+
+  /**
+   * Must be invoked when the stored hash code may no longer be valid.
+   */
+  void clearHashCode() {
+    this.hash_code = 0;
+  }
+
+  /**
+   * Returns a string representation of this automaton.
+   */
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    if (isSingleton()) {
+      b.append("singleton: ");
+      for (char c : singleton.toCharArray())
+        Transition.appendCharString(c, b);
+      b.append("\n");
+    } else {
+      Set<State> states = getStates();
+      // Number the states so the printed state numbers are consistent.
+      setStateNumbers(states);
+      b.append("initial state: ").append(initial.number).append("\n");
+      for (State s : states)
+        b.append(s.toString());
+    }
+    return b.toString();
+  }
+
+  /**
+   * Returns <a href="http://www.research.att.com/sw/tools/graphviz/"
+   * target="_top">Graphviz Dot</a> representation of this automaton.
+   */
+  public String toDot() {
+    StringBuilder b = new StringBuilder("digraph Automaton {\n");
+    b.append("  rankdir = LR;\n");
+    Set<State> states = getStates();
+    setStateNumbers(states);
+    for (State s : states) {
+      b.append("  ").append(s.number);
+      if (s.accept) b.append(" [shape=doublecircle,label=\"\"];\n");
+      else b.append(" [shape=circle,label=\"\"];\n");
+      if (s == initial) {
+        b.append("  initial [shape=plaintext,label=\"\"];\n");
+        b.append("  initial -> ").append(s.number).append("\n");
+      }
+      for (Transition t : s.transitions) {
+        b.append("  ").append(s.number);
+        t.appendDot(b);
+      }
+    }
+    return b.append("}\n").toString();
+  }
+
+  /**
+   * Returns a clone of this automaton, expands if singleton.
+   */
+  Automaton cloneExpanded() {
+    Automaton a = clone();
+    a.expandSingleton();
+    return a;
+  }
+
+  /**
+   * Returns a clone of this automaton unless <code>allow_mutation</code> is
+   * set, expands if singleton.
+   */
+  Automaton cloneExpandedIfRequired() {
+    if (allow_mutation) {
+      expandSingleton();
+      return this;
+    } else return cloneExpanded();
+  }
+
+  /**
+   * Returns a clone of this automaton.
+   */
+  @Override
+  public Automaton clone() {
+    try {
+      Automaton a = (Automaton) super.clone();
+      if (!isSingleton()) {
+        // Deep-copy the state graph: m maps each original state to its copy.
+        HashMap<State,State> m = new HashMap<State,State>();
+        Set<State> states = getStates();
+        for (State s : states)
+          m.put(s, new State());
+        for (State s : states) {
+          State p = m.get(s);
+          p.accept = s.accept;
+          if (s == initial) a.initial = p;
+          for (Transition t : s.transitions)
+            p.transitions.add(new Transition(t.min, t.max, m.get(t.to)));
+        }
+      }
+      return a;
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Returns a clone of this automaton, or this automaton itself if
+   * <code>allow_mutation</code> flag is set.
+   */
+  Automaton cloneIfRequired() {
+    if (allow_mutation) return this;
+    else return clone();
+  }
+
+  /**
+   * Retrieves a serialized <code>Automaton</code> located by a URL. The stream
+   * opened on the URL is closed before this method returns.
+   *
+   * @param url URL of serialized automaton
+   * @exception IOException if input/output related exception occurs
+   * @exception OptionalDataException if the data is not a serialized object
+   * @exception InvalidClassException if the class serial number does not match
+   * @exception ClassCastException if the data is not a serialized
+   *              <code>Automaton</code>
+   * @exception ClassNotFoundException if the class of the serialized object
+   *              cannot be found
+   */
+  public static Automaton load(URL url) throws IOException,
+      OptionalDataException, ClassCastException, ClassNotFoundException,
+      InvalidClassException {
+    // This method opens the stream, so it is responsible for closing it;
+    // previously the stream was never closed.
+    InputStream in = url.openStream();
+    try {
+      return load(in);
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Retrieves a serialized <code>Automaton</code> from a stream.
+   *
+   * @param stream input stream with serialized automaton
+   * @exception IOException if input/output related exception occurs
+   * @exception OptionalDataException if the data is not a serialized object
+   * @exception InvalidClassException if the class serial number does not match
+   * @exception ClassCastException if the data is not a serialized
+   *              <code>Automaton</code>
+   * @exception ClassNotFoundException if the class of the serialized object
+   *              cannot be found
+   */
+  public static Automaton load(InputStream stream) throws IOException,
+      OptionalDataException, ClassCastException, ClassNotFoundException,
+      InvalidClassException {
+    // NOTE(review): native Java deserialization; only use on trusted data.
+    // The stream belongs to the caller and is not closed here.
+    ObjectInputStream s = new ObjectInputStream(stream);
+    return (Automaton) s.readObject();
+  }
+
+  /**
+   * Writes this <code>Automaton</code> to the given stream. The stream is
+   * flushed but not closed.
+   *
+   * @param stream output stream for serialized automaton
+   * @exception IOException if input/output related exception occurs
+   */
+  public void store(OutputStream stream) throws IOException {
+    ObjectOutputStream s = new ObjectOutputStream(stream);
+    s.writeObject(this);
+    s.flush();
+  }
+
+  /**
+   * See {@link BasicOperations#concatenate(Automaton, Automaton)}.
+   */
+  public Automaton concatenate(Automaton a) {
+    return BasicOperations.concatenate(this, a);
+  }
+
+  /**
+   * See {@link BasicOperations#concatenate(List)}.
+   */
+  static public Automaton concatenate(List<Automaton> l) {
+    return BasicOperations.concatenate(l);
+  }
+
+  /**
+   * See {@link BasicOperations#optional(Automaton)}.
+   */
+  public Automaton optional() {
+    return BasicOperations.optional(this);
+  }
+
+  /**
+   * See {@link BasicOperations#repeat(Automaton)}.
+   */
+  public Automaton repeat() {
+    return BasicOperations.repeat(this);
+  }
+
+  /**
+   * See {@link BasicOperations#repeat(Automaton, int)}.
+   */
+  public Automaton repeat(int min) {
+    return BasicOperations.repeat(this, min);
+  }
+
+  /**
+   * See {@link BasicOperations#repeat(Automaton, int, int)}.
+   */
+  public Automaton repeat(int min, int max) {
+    return BasicOperations.repeat(this, min, max); // this automaton is the first argument
+  }
+
+  /**
+   * See {@link BasicOperations#complement(Automaton)}.
+   * Convenience wrapper that passes this automaton as the argument.
+   */
+  public Automaton complement() {
+    return BasicOperations.complement(this);
+  }
+
+  /**
+   * See {@link BasicOperations#minus(Automaton, Automaton)}.
+   * Convenience wrapper: this automaton is the first argument and
+   * <code>a</code> the second.
+   */
+  public Automaton minus(Automaton a) {
+    return BasicOperations.minus(this, a);
+  }
+
+  /**
+   * See {@link BasicOperations#intersection(Automaton, Automaton)}.
+   * Convenience wrapper: this automaton is the first argument and
+   * <code>a</code> the second.
+   */
+  public Automaton intersection(Automaton a) {
+    return BasicOperations.intersection(this, a);
+  }
+
+  /**
+   * See {@link BasicOperations#subsetOf(Automaton, Automaton)}.
+   * Convenience wrapper: this automaton is the first argument and
+   * <code>a</code> the second.
+   */
+  public boolean subsetOf(Automaton a) {
+    return BasicOperations.subsetOf(this, a);
+  }
+
+  /**
+   * See {@link BasicOperations#union(Automaton, Automaton)}.
+   * Convenience wrapper: this automaton is the first argument and
+   * <code>a</code> the second.
+   */
+  public Automaton union(Automaton a) {
+    return BasicOperations.union(this, a);
+  }
+
+  /**
+   * See {@link BasicOperations#union(Collection)}.
+   * Static convenience wrapper; the collection is passed through unchanged.
+   */
+  static public Automaton union(Collection l) {
+    return BasicOperations.union(l);
+  }
+
+  /**
+   * See {@link BasicOperations#determinize(Automaton)}.
+   * Passes this automaton to the operation and returns nothing.
+   * NOTE(review): presumably this automaton is determinized in place -
+   * confirm against BasicOperations.
+   */
+  public void determinize() {
+    BasicOperations.determinize(this);
+  }
+
+  /**
+   * See {@link BasicOperations#addEpsilons(Automaton, Collection)}.
+   * Passes this automaton and the given pairs to the operation and returns
+   * nothing.
+   */
+  public void addEpsilons(Collection pairs) {
+    BasicOperations.addEpsilons(this, pairs);
+  }
+
+  /**
+   * See {@link BasicOperations#isEmptyString(Automaton)}.
+   * Convenience wrapper that passes this automaton as the argument.
+   */
+  public boolean isEmptyString() {
+    return BasicOperations.isEmptyString(this);
+  }
+
+  /**
+   * See {@link BasicOperations#run(Automaton, String)}.
+   * Convenience wrapper: this automaton is the first argument and
+   * <code>s</code> the second.
+   */
+  public boolean run(String s) {
+    return BasicOperations.run(this, s);
+  }
+
+  /**
+   * See {@link MinimizationOperations#minimize(Automaton)}. Returns the
+   * automaton being given as argument.
+   */
+  public static Automaton minimize(Automaton a) {
+    MinimizationOperations.minimize(a); // result of the call is not used; the argument itself is handed back
+    return a; // same instance that was passed in, so calls can be chained
+  }
+}

Property changes on: src/java/org/apache/lucene/util/automaton/Automaton.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/util/automaton/RegExp.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0)
@@ -0,0 +1,1003 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Regular Expression extension to Automaton. + *

+ * Regular expressions are built from the following abstract syntax: + *

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
regexp        ::=  unionexp
                |
unionexp      ::=  interexp | unionexp      (union)
                |  interexp
interexp      ::=  concatexp & interexp     (intersection)                       [OPTIONAL]
                |  concatexp
concatexp     ::=  repeatexp concatexp      (concatenation)
                |  repeatexp
repeatexp     ::=  repeatexp ?              (zero or one occurrence)
                |  repeatexp *              (zero or more occurrences)
                |  repeatexp +              (one or more occurrences)
                |  repeatexp {n}            (n occurrences)
                |  repeatexp {n,}           (n or more occurrences)
                |  repeatexp {n,m}          (n to m occurrences, including both)
                |  complexp
complexp      ::=  ~ complexp               (complement)                         [OPTIONAL]
                |  charclassexp
charclassexp  ::=  [ charclasses ]          (character class)
                |  [^ charclasses ]         (negated character class)
                |  simpleexp
charclasses   ::=  charclass charclasses
                |  charclass
charclass     ::=  charexp - charexp        (character range, including end-points)
                |  charexp
simpleexp     ::=  charexp
                |  .                        (any single character)
                |  #                        (the empty language)                 [OPTIONAL]
                |  @                        (any string)                         [OPTIONAL]
                |  " <Unicode string without double-quotes> "   (a string)
                |  ( )                      (the empty string)
                |  ( unionexp )             (precedence override)
                |  < <identifier> >         (named automaton)                    [OPTIONAL]
                |  <n-m>                    (numerical interval)                 [OPTIONAL]
charexp       ::=  <Unicode character>      (a single non-reserved character)
                |  \ <Unicode character>    (a single character)
+ *

+ * The productions marked [OPTIONAL] are only allowed if + * specified by the syntax flags passed to the RegExp constructor. + * The reserved characters used in the (enabled) syntax must be escaped with + * backslash (\) or double-quotes ("..."). (In + * contrast to other regexp syntaxes, this is required also in character + * classes.) Be aware that dash (-) has a special meaning in + * charclass expressions. An identifier is a string not containing right + * angle bracket (>) or dash (-). Numerical + * intervals are specified by non-negative decimal integers and include both end + * points, and if n and m have the same number + * of digits, then the conforming strings must have that length (i.e. prefixed + * by 0's). + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class RegExp {
+
+  enum Kind {
+    REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
+  }
+
+  /**
+   * Syntax flag, enables intersection (&).
+   */
+  public static final int INTERSECTION = 0x0001;
+
+  /**
+   * Syntax flag, enables complement (~).
+   */
+  public static final int COMPLEMENT = 0x0002;
+
+  /**
+   * Syntax flag, enables empty language (#).
+   */
+  public static final int EMPTY = 0x0004;
+
+  /**
+   * Syntax flag, enables anystring (@).
+   */
+  public static final int ANYSTRING = 0x0008;
+
+  /**
+   * Syntax flag, enables named automata (<identifier>).
+   */
+  public static final int AUTOMATON = 0x0010;
+
+  /**
+   * Syntax flag, enables numerical intervals (
+   * <n-m>).
+   */
+  public static final int INTERVAL = 0x0020;
+
+  /**
+   * Syntax flag, enables all optional regexp syntax.
+   */
+  public static final int ALL = 0xffff;
+
+  /**
+   * Syntax flag, enables no optional regexp syntax.
+   */
+  public static final int NONE = 0x0000;
+
+  private static boolean allow_mutation = false; // static: one flag shared by every RegExp instance
+
+  Kind kind; // node type of this (sub)expression
+  RegExp exp1, exp2; // operands; unary kinds only set exp1, leaf kinds set neither
+  String s; // literal text for REGEXP_STRING, identifier for REGEXP_AUTOMATON
+  char c; // the character of a REGEXP_CHAR node
+  int min, max, digits; // repeat bounds, or end points and digit count of a REGEXP_INTERVAL
+  char from, to; // end points of a REGEXP_CHAR_RANGE
+
+  String b; // input being parsed; reset to null when the parsing constructor finishes
+  int flags; // syntax flags given to the constructor, tested by the parser
+  int pos; // current parse offset into b
+
+  RegExp() {}
+
+  /**
+   * Constructs new RegExp from a string. Same as
+   * RegExp(s, ALL).
+   * 
+   * @param s regexp string
+   * @exception IllegalArgumentException if an error occurred while parsing the
+   *              regular expression
+   */
+  public RegExp(String s) throws IllegalArgumentException {
+    this(s, ALL);
+  }
+
+  /**
+   * Constructs new RegExp from a string.
+   * 
+   * @param s regexp string
+   * @param syntax_flags boolean 'or' of optional syntax constructs to be
+   *          enabled
+   * @exception IllegalArgumentException if an error occured while parsing the
+   *              regular expression
+   */
+  public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
+    b = s;
+    flags = syntax_flags;
+    RegExp e;
+    if (s.length() == 0) e = makeString(""); // empty input is the empty-string expression; the parser is not run
+    else {
+      e = parseUnionExp(); // top-level production of the recursive-descent parser
+      if (pos < b.length()) throw new IllegalArgumentException( // the parser stopped before consuming all input
+          "end-of-string expected at position " + pos);
+    }
+    // Copy the parsed root node into this instance field by field.
+    kind = e.kind;
+    exp1 = e.exp1;
+    exp2 = e.exp2;
+    this.s = e.s;
+    c = e.c;
+    min = e.min;
+    max = e.max;
+    digits = e.digits;
+    from = e.from;
+    to = e.to;
+    b = null; // parsing is finished, drop the reference to the input
+  }
+
+  /**
+   * Constructs new Automaton from this RegExp. Same
+   * as toAutomaton(null) (empty automaton map).
+   */
+  public Automaton toAutomaton() {
+    return toAutomatonAllowMutate(null, null); // neither a map nor a provider for named automata
+  }
+
+  /**
+   * Constructs new Automaton from this RegExp. The
+   * constructed automaton is minimal and deterministic and has no transitions
+   * to dead states.
+   * 
+   * @param automaton_provider provider of automata for named identifiers
+   * @exception IllegalArgumentException if this regular expression uses a named
+   *              identifier that is not available from the automaton provider
+   */
+  public Automaton toAutomaton(AutomatonProvider automaton_provider)
+      throws IllegalArgumentException {
+    return toAutomatonAllowMutate(null, automaton_provider);
+  }
+
+  /**
+   * Constructs new Automaton from this RegExp. The
+   * constructed automaton is minimal and deterministic and has no transitions
+   * to dead states.
+   * 
+   * @param automata a map from automaton identifiers to automata (of type
+   *          Automaton).
+ * @exception IllegalArgumentException if this regular expression uses a named + * identifier that does not occur in the automaton map + */ + public Automaton toAutomaton(Map automata) + throws IllegalArgumentException { + return toAutomatonAllowMutate(automata, null); + } + + /** + * Sets or resets allow mutate flag. If this flag is set, then automata + * construction uses mutable automata, which is slightly faster but not thread + * safe. By default, the flag is not set. + * + * @param flag if true, the flag is set + * @return previous value of the flag + */ + public boolean setAllowMutate(boolean flag) { + boolean b = allow_mutation; + allow_mutation = flag; + return b; + } + + private Automaton toAutomatonAllowMutate(Map automata, + AutomatonProvider automaton_provider) throws IllegalArgumentException { + boolean b = false; + if (allow_mutation) b = Automaton.setAllowMutate(true); // thread unsafe + Automaton a = toAutomaton(automata, automaton_provider); + if (allow_mutation) Automaton.setAllowMutate(b); + return a; + } + + private Automaton toAutomaton(Map automata, + AutomatonProvider automaton_provider) throws IllegalArgumentException { + List list; + Automaton a = null; + switch (kind) { + case REGEXP_UNION: + list = new ArrayList(); + findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider); + findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider); + a = BasicOperations.union(list); + MinimizationOperations.minimize(a); + break; + case REGEXP_CONCATENATION: + list = new ArrayList(); + findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata, + automaton_provider); + findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata, + automaton_provider); + a = BasicOperations.concatenate(list); + MinimizationOperations.minimize(a); + break; + case REGEXP_INTERSECTION: + a = exp1.toAutomaton(automata, automaton_provider).intersection( + exp2.toAutomaton(automata, automaton_provider)); + MinimizationOperations.minimize(a); + 
break; + case REGEXP_OPTIONAL: + a = exp1.toAutomaton(automata, automaton_provider).optional(); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT: + a = exp1.toAutomaton(automata, automaton_provider).repeat(); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT_MIN: + a = exp1.toAutomaton(automata, automaton_provider).repeat(min); + MinimizationOperations.minimize(a); + break; + case REGEXP_REPEAT_MINMAX: + a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max); + MinimizationOperations.minimize(a); + break; + case REGEXP_COMPLEMENT: + a = exp1.toAutomaton(automata, automaton_provider).complement(); + MinimizationOperations.minimize(a); + break; + case REGEXP_CHAR: + a = BasicAutomata.makeChar(c); + break; + case REGEXP_CHAR_RANGE: + a = BasicAutomata.makeCharRange(from, to); + break; + case REGEXP_ANYCHAR: + a = BasicAutomata.makeAnyChar(); + break; + case REGEXP_EMPTY: + a = BasicAutomata.makeEmpty(); + break; + case REGEXP_STRING: + a = BasicAutomata.makeString(s); + break; + case REGEXP_ANYSTRING: + a = BasicAutomata.makeAnyString(); + break; + case REGEXP_AUTOMATON: + Automaton aa = null; + if (automata != null) aa = automata.get(s); + if (aa == null && automaton_provider != null) try { + aa = automaton_provider.getAutomaton(s); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + if (aa == null) throw new IllegalArgumentException("'" + s + + "' not found"); + a = aa.clone(); // always clone here (ignore allow_mutate) + break; + case REGEXP_INTERVAL: + a = BasicAutomata.makeInterval(min, max, digits); + break; + } + return a; + } + + private void findLeaves(RegExp exp, Kind kind, List list, + Map automata, AutomatonProvider automaton_provider) { + if (exp.kind == kind) { + findLeaves(exp.exp1, kind, list, automata, automaton_provider); + findLeaves(exp.exp2, kind, list, automata, automaton_provider); + } else list.add(exp.toAutomaton(automata, automaton_provider)); + } + + /** + * Constructs 
string from parsed regular expression.
+   */
+  @Override
+  public String toString() {
+    return toStringBuilder(new StringBuilder()).toString();
+  }
+
+  StringBuilder toStringBuilder(StringBuilder b) { // appends this node's textual form to b and returns b
+    switch (kind) {
+      case REGEXP_UNION:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append("|");
+        exp2.toStringBuilder(b);
+        b.append(")");
+        break;
+      case REGEXP_CONCATENATION: // written without parentheses or separator
+        exp1.toStringBuilder(b);
+        exp2.toStringBuilder(b);
+        break;
+      case REGEXP_INTERSECTION:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append("&");
+        exp2.toStringBuilder(b);
+        b.append(")");
+        break;
+      case REGEXP_OPTIONAL:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append(")?");
+        break;
+      case REGEXP_REPEAT:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append(")*");
+        break;
+      case REGEXP_REPEAT_MIN:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append("){").append(min).append(",}");
+        break;
+      case REGEXP_REPEAT_MINMAX:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append("){").append(min).append(",").append(max).append("}");
+        break;
+      case REGEXP_COMPLEMENT:
+        b.append("~(");
+        exp1.toStringBuilder(b);
+        b.append(")");
+        break;
+      case REGEXP_CHAR: // every single character is written with a leading backslash
+        b.append("\\").append(c);
+        break;
+      case REGEXP_CHAR_RANGE: // both end points are written with a leading backslash
+        b.append("[\\").append(from).append("-\\").append(to).append("]");
+        break;
+      case REGEXP_ANYCHAR:
+        b.append(".");
+        break;
+      case REGEXP_EMPTY:
+        b.append("#");
+        break;
+      case REGEXP_STRING: // the text is written between double quotes as is
+        b.append("\"").append(s).append("\"");
+        break;
+      case REGEXP_ANYSTRING:
+        b.append("@");
+        break;
+      case REGEXP_AUTOMATON:
+        b.append("<").append(s).append(">");
+        break;
+      case REGEXP_INTERVAL:
+        String s1 = Integer.toString(min);
+        String s2 = Integer.toString(max);
+        b.append("<");
+        if (digits > 0) for (int i = s1.length(); i < digits; i++) // left-pad the lower bound with zeros up to the digit count
+          b.append('0');
+        b.append(s1).append("-");
+        if (digits > 0) for (int i = s2.length(); i < digits; i++) // same padding for the upper bound
+          b.append('0');
+        b.append(s2).append(">");
+        break;
+    }
+    return b;
+  }
+
+  /**
+   * Returns set of automaton identifiers
that occur in this regular expression.
+   */
+  public Set getIdentifiers() {
+    HashSet set = new HashSet();
+    getIdentifiers(set);
+    return set;
+  }
+
+  void getIdentifiers(Set set) { // recursive worker: adds the identifiers below this node to set
+    switch (kind) {
+      case REGEXP_UNION:
+      case REGEXP_CONCATENATION:
+      case REGEXP_INTERSECTION: // binary nodes: visit both operands
+        exp1.getIdentifiers(set);
+        exp2.getIdentifiers(set);
+        break;
+      case REGEXP_OPTIONAL:
+      case REGEXP_REPEAT:
+      case REGEXP_REPEAT_MIN:
+      case REGEXP_REPEAT_MINMAX:
+      case REGEXP_COMPLEMENT: // unary nodes: visit the single operand
+        exp1.getIdentifiers(set);
+        break;
+      case REGEXP_AUTOMATON:
+        set.add(s);
+        break;
+      default: // remaining leaf kinds carry no identifier
+    }
+  }
+
+  static RegExp makeUnion(RegExp exp1, RegExp exp2) { // new REGEXP_UNION node over the two operands
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_UNION;
+    r.exp1 = exp1;
+    r.exp2 = exp2;
+    return r;
+  }
+
+  static RegExp makeConcatenation(RegExp exp1, RegExp exp2) { // concatenation node; adjacent chars/strings are merged into one string node
+    if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
+        && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString( // both operands are literals: return one string node
+        exp1, exp2);
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_CONCATENATION;
+    if (exp1.kind == Kind.REGEXP_CONCATENATION
+        && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
+        && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) { // left operand ends in a literal: merge it with the right literal
+      r.exp1 = exp1.exp1;
+      r.exp2 = makeString(exp1.exp2, exp2);
+    } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
+        && exp2.kind == Kind.REGEXP_CONCATENATION
+        && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) { // right operand starts with a literal: merge it with the left literal
+      r.exp1 = makeString(exp1, exp2.exp1);
+      r.exp2 = exp2.exp2;
+    } else {
+      r.exp1 = exp1;
+      r.exp2 = exp2;
+    }
+    return r;
+  }
+
+  static private RegExp makeString(RegExp exp1, RegExp exp2) { // joins two char/string nodes into a single REGEXP_STRING node
+    StringBuilder b = new StringBuilder();
+    if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
+    else b.append(exp1.c);
+    if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
+    else b.append(exp2.c);
+    return makeString(b.toString());
+  }
+
+  static RegExp makeIntersection(RegExp exp1, RegExp exp2) { // new REGEXP_INTERSECTION node over the two operands
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_INTERSECTION;
+    r.exp1 = exp1;
+    r.exp2 = exp2;
+    return r;
+  }
+
+  static RegExp makeOptional(RegExp exp) { // new REGEXP_OPTIONAL node wrapping exp
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_OPTIONAL;
+    r.exp1 = exp;
+    return r;
+  }
+
+  static RegExp makeRepeat(RegExp exp) { // new REGEXP_REPEAT node wrapping exp
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_REPEAT;
+    r.exp1 = exp;
+    return r;
+  }
+
+  static RegExp makeRepeat(RegExp exp, int min) { // new REGEXP_REPEAT_MIN node with a lower bound only
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_REPEAT_MIN;
+    r.exp1 = exp;
+    r.min = min;
+    return r;
+  }
+
+  static RegExp makeRepeat(RegExp exp, int min, int max) { // new REGEXP_REPEAT_MINMAX node with both bounds
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_REPEAT_MINMAX;
+    r.exp1 = exp;
+    r.min = min;
+    r.max = max;
+    return r;
+  }
+
+  static RegExp makeComplement(RegExp exp) { // new REGEXP_COMPLEMENT node wrapping exp
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_COMPLEMENT;
+    r.exp1 = exp;
+    return r;
+  }
+
+  static RegExp makeChar(char c) { // new REGEXP_CHAR leaf
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_CHAR;
+    r.c = c;
+    return r;
+  }
+
+  static RegExp makeCharRange(char from, char to) { // new REGEXP_CHAR_RANGE leaf; the end points are stored as given
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_CHAR_RANGE;
+    r.from = from;
+    r.to = to;
+    return r;
+  }
+
+  static RegExp makeAnyChar() { // new REGEXP_ANYCHAR leaf
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_ANYCHAR;
+    return r;
+  }
+
+  static RegExp makeEmpty() { // new REGEXP_EMPTY leaf
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_EMPTY;
+    return r;
+  }
+
+  static RegExp makeString(String s) { // new REGEXP_STRING leaf holding s
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_STRING;
+    r.s = s;
+    return r;
+  }
+
+  static RegExp makeAnyString() { // new REGEXP_ANYSTRING leaf
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_ANYSTRING;
+    return r;
+  }
+
+  static RegExp makeAutomaton(String s) { // new REGEXP_AUTOMATON leaf; s is the identifier of the named automaton
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_AUTOMATON;
+    r.s = s;
+    return r;
+  }
+
+  static RegExp makeInterval(int min, int max, int digits) { // new REGEXP_INTERVAL leaf
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_INTERVAL;
+    r.min = min;
+    r.max = max;
+    r.digits = digits;
+    return r;
+  }
+
+  private boolean peek(String s) { // true if input remains and the next character is one of those in s; consumes nothing
+    return more() && s.indexOf(b.charAt(pos)) != -1;
+  }
+
+  private boolean match(char c) { // consumes the next character only if it equals c
+    if (pos >= b.length()) return false;
+    if (b.charAt(pos) == c) {
+      pos++;
+      return true;
+    }
+    return false;
+  }
+
+  private boolean more() { // true while unread input remains
+    return pos < b.length();
+  }
+
+  private char next() throws IllegalArgumentException { // consumes and returns the next character
+    if (!more()) throw new IllegalArgumentException("unexpected end-of-string");
+    return b.charAt(pos++);
+  }
+
+  private boolean check(int flag) { // true if the given syntax flag is enabled
+    return (flags & flag) != 0;
+  }
+
+  final RegExp parseUnionExp() throws IllegalArgumentException { // unionexp: interexp, optionally followed by '|' and another unionexp
+    RegExp e = parseInterExp();
+    if (match('|')) e = makeUnion(e, parseUnionExp());
+    return e;
+  }
+
+  final RegExp parseInterExp() throws IllegalArgumentException { // interexp: '&' is only recognised when INTERSECTION is enabled
+    RegExp e = parseConcatExp();
+    if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
+        parseInterExp());
+    return e;
+  }
+
+  final RegExp parseConcatExp() throws IllegalArgumentException { // concatexp: keeps concatenating until ')' or '|' (or '&' when enabled) or end of input
+    RegExp e = parseRepeatExp();
+    if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
+        e, parseConcatExp());
+    return e;
+  }
+
+  final RegExp parseRepeatExp() throws IllegalArgumentException { // repeatexp: a complexp followed by any number of postfix repeat operators
+    RegExp e = parseComplExp();
+    while (peek("?*+{")) {
+      if (match('?')) e = makeOptional(e);
+      else if (match('*')) e = makeRepeat(e);
+      else if (match('+')) e = makeRepeat(e, 1); // one or more is encoded as repeat with minimum 1
+      else if (match('{')) {
+        int start = pos;
+        while (peek("0123456789"))
+          next();
+        if (start == pos) throw new IllegalArgumentException(
+            "integer expected at position " + pos);
+        int n = Integer.parseInt(b.substring(start, pos));
+        int m = -1; // -1 means no upper bound was given
+        if (match(',')) {
+          start = pos;
+          while (peek("0123456789"))
+            next();
+          if (start != pos) m = Integer.parseInt(b.substring(start, pos));
+        } else m = n; // {n}: exactly n occurrences
+        if (!match('}')) throw new IllegalArgumentException(
+            "expected '}' at position " + pos);
+        if (m == -1) e = makeRepeat(e, n);
+        else e = makeRepeat(e, n, m);
+      }
+    }
+    return e;
+  }
+
+  final RegExp parseComplExp() throws IllegalArgumentException { // complexp: '~' is only recognised when COMPLEMENT is enabled
+    if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
+    else return parseCharClassExp();
+  }
+
+  final RegExp parseCharClassExp() throws IllegalArgumentException { // charclassexp: a bracketed class, or else a simpleexp
+    if (match('[')) {
+      boolean negate = false;
+      if (match('^')) negate = true;
+      RegExp e = parseCharClasses();
+      if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e)); // a negated class is built as (any char) & ~(class)
+      if (!match(']')) throw new IllegalArgumentException(
+          "expected ']' at position " + pos);
+      return e;
+    } else return parseSimpleExp();
+  }
+
+  final RegExp parseCharClasses() throws IllegalArgumentException { // one or more charclass items up to ']', combined as a union
+    RegExp e = parseCharClass();
+    while (more() && !peek("]"))
+      e = makeUnion(e, parseCharClass());
+    return e;
+  }
+
+  final RegExp parseCharClass() throws IllegalArgumentException { // a single character, or a range when followed by '-'
+    char c = parseCharExp();
+    if (match('-')) return makeCharRange(c, parseCharExp());
+    else return makeChar(c);
+  }
+
+  final RegExp parseSimpleExp() throws IllegalArgumentException { // simpleexp: the atoms of the grammar
+    if (match('.')) return makeAnyChar();
+    else if (check(EMPTY) && match('#')) return makeEmpty();
+    else if (check(ANYSTRING) && match('@')) return makeAnyString();
+    else if (match('"')) { // quoted string: everything up to the next double quote, taken verbatim
+      int start = pos;
+      while (more() && !peek("\""))
+        next();
+      if (!match('"')) throw new IllegalArgumentException(
+          "expected '\"' at position " + pos);
+      return makeString(b.substring(start, pos - 1));
+    } else if (match('(')) {
+      if (match(')')) return makeString(""); // "()" is the empty string
+      RegExp e = parseUnionExp();
+      if (!match(')')) throw new IllegalArgumentException(
+          "expected ')' at position " + pos);
+      return e;
+    } else if ((check(AUTOMATON) || check(INTERVAL)) && match('<')) { // named automaton or numerical interval
+      int start = pos;
+      while (more() && !peek(">"))
+        next();
+      if (!match('>')) throw new IllegalArgumentException(
+          "expected '>' at position " + pos);
+      String s = b.substring(start, pos - 1);
+      int i = s.indexOf('-');
+      if (i == -1) { // no dash: the text is an automaton identifier
+        if (!check(AUTOMATON)) throw new IllegalArgumentException(
+            "interval syntax error at position " + (pos - 1));
+        return makeAutomaton(s);
+      } else { // a dash: the text must be an interval of the form n-m
+        if (!check(INTERVAL)) throw new IllegalArgumentException(
+            "illegal identifier at position " + (pos - 1));
+        try {
+          if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-')) throw new NumberFormatException(); // exactly one dash, with text on both sides
+          String smin = s.substring(0, i);
+          String smax = s.substring(i + 1, s.length());
+          int imin = Integer.parseInt(smin);
+          int imax = Integer.parseInt(smax);
+          int digits;
+          if (smin.length() == smax.length()) digits = smin.length(); // equal widths fix the digit count
+          else digits = 0;
+          if (imin > imax) { // bounds given in reverse order are swapped
+            int t = imin;
+            imin = imax;
+            imax = t;
+          }
+          return makeInterval(imin, imax, digits);
+        } catch (NumberFormatException e) {
+          throw new IllegalArgumentException(
+              "interval syntax error at position " + (pos - 1));
+        }
+      }
+    } else return makeChar(parseCharExp());
+  }
+
+  final char parseCharExp() throws IllegalArgumentException { // skips an optional backslash, then returns the next character verbatim
+    match('\\');
+    return next();
+  }
+}

Property changes on: src/java/org/apache/lucene/util/automaton/RegExp.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/util/automaton/Transition.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/Transition.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/Transition.java (revision 0)
@@ -0,0 +1,179 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.Serializable; + +/** + * Automaton transition. + *

+ * A transition, which belongs to a source state, consists of a Unicode + * character interval and a destination state. + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class Transition implements Serializable, Cloneable {
+
+  static final long serialVersionUID = 40001;
+
+  /*
+   * CLASS INVARIANT: min<=max
+   */
+
+  char min; // lower end of the character interval (inclusive)
+  char max; // upper end of the character interval (inclusive)
+
+  State to; // destination state
+
+  /**
+   * Constructs a new singleton interval transition.
+   * 
+   * @param c transition character
+   * @param to destination state
+   */
+  public Transition(char c, State to) {
+    min = max = c;
+    this.to = to;
+  }
+
+  /**
+   * Constructs a new transition. Both end points are included in the interval.
+   * If the end points are given in reverse order they are swapped, so the
+   * class invariant holds either way.
+   * 
+   * @param min transition interval minimum
+   * @param max transition interval maximum
+   * @param to destination state
+   */
+  public Transition(char min, char max, State to) {
+    if (max < min) {
+      char t = max;
+      max = min;
+      min = t;
+    }
+    this.min = min;
+    this.max = max;
+    this.to = to;
+  }
+
+  /** Returns minimum of this transition interval. */
+  public char getMin() {
+    return min;
+  }
+
+  /** Returns maximum of this transition interval. */
+  public char getMax() {
+    return max;
+  }
+
+  /** Returns destination of this transition. */
+  public State getDest() {
+    return to;
+  }
+
+  /**
+   * Checks for equality. The destination states are compared by reference.
+   * 
+   * @param obj object to compare with
+   * @return true if obj is a transition with same character interval
+   *         and destination state as this transition.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj instanceof Transition) {
+      Transition t = (Transition) obj;
+      return t.min == min && t.max == max && t.to == to;
+    } else return false;
+  }
+
+  /**
+   * Returns hash code. The hash code is based on the character interval (not
+   * the destination state).
+   * 
+   * @return hash code
+   */
+  @Override
+  public int hashCode() {
+    return min * 2 + max * 3;
+  }
+
+  /**
+   * Clones this transition.
+   * 
+   * @return clone with same character interval and destination state
+   */
+  @Override
+  public Transition clone() {
+    try {
+      return (Transition) super.clone(); // field-by-field copy: the clone refers to the same destination state object
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  static void appendCharString(char c, StringBuilder b) { // appends c as is, or as a backslash-u escape with four hex digits
+    if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.append(c); // ASCII 0x21-0x7e except backslash and double quote
+    else {
+      b.append("\\u");
+      String s = Integer.toHexString(c);
+      if (c < 0x10) b.append("000").append(s); // left-pad the hex value to four digits
+      else if (c < 0x100) b.append("00").append(s);
+      else if (c < 0x1000) b.append("0").append(s);
+      else b.append(s);
+    }
+  }
+
+  /**
+   * Returns a string describing this transition: the character interval
+   * followed by the number of the destination state. Normally invoked via
+   * {@link Automaton#toString()}.
+   */
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    appendCharString(min, b);
+    if (min != max) { // a singleton interval is written as one character
+      b.append("-");
+      appendCharString(max, b);
+    }
+    b.append(" -> ").append(to.number);
+    return b.toString();
+  }
+
+  void appendDot(StringBuilder b) { // appends an edge to the destination state, labelled with the interval, followed by a newline
+    b.append(" -> ").append(to.number).append(" [label=\"");
+    appendCharString(min, b);
+    if (min != max) {
+      b.append("-");
+      appendCharString(max, b);
+    }
+    b.append("\"]\n");
+  }
+}

Property changes on: src/java/org/apache/lucene/util/automaton/Transition.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/util/automaton/AutomatonMatcher.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/AutomatonMatcher.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/AutomatonMatcher.java (revision 0)
@@ -0,0 +1,270 @@
+/*
+ * dk.brics.automaton - AutomatonMatcher
+ * 
+ * Copyright (c) 2008 John Gibson
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.regex.MatchResult; + +/** + * A tool that performs match operations on a given character sequence using a + * compiled automaton. + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ *
+ * @see RunAutomaton#newMatcher(java.lang.CharSequence)
+ * @see RunAutomaton#newMatcher(java.lang.CharSequence, int, int)
+ */
+public class AutomatonMatcher implements MatchResult {
+
+  AutomatonMatcher(final CharSequence chars, final RunAutomaton automaton) {
+    this.chars = chars;
+    this.automaton = automaton;
+  }
+
+  private final RunAutomaton automaton;
+  private CharSequence chars;
+
+  // Match state: both offsets are -1 before the first find() call and -2 once
+  // find() has reported that no further match exists. Otherwise they delimit
+  // the last match as [matchStart, matchEnd).
+  private int matchStart = -1;
+
+  private int matchEnd = -1;
+
+  /**
+   * Reset this matcher to a new CharSequence. Any previous match is forgotten
+   * and the next {@link #find()} starts at offset 0 of the new input.
+   *
+   * @param chars the new input to search
+   */
+  public void reset(final CharSequence chars) {
+    this.chars = chars;
+    this.matchStart = -1;
+    this.matchEnd = -1;
+  }
+
+  /**
+   * Find the next matching subsequence of the input.
+   * <br>
+   * This also updates the values for the {@code start}, {@code end}, and
+   * {@code group} methods.
+   * <br>
+   * The search resumes at the end of the previous match. If the previous
+   * match was empty (possible when the initial state of the automaton is an
+   * accept state) the search resumes one character later, so that repeated
+   * calls always make progress. For each start offset the longest accepted
+   * run is reported.
+   *
+   * @return {@code true} if there is a matching subsequence.
+   */
+  public boolean find() {
+    int begin;
+    if (getMatchStart() == -2) {
+      // a previous call already exhausted the input
+      return false;
+    } else if (getMatchStart() == -1) {
+      // first attempt on this input
+      begin = 0;
+    } else {
+      begin = getMatchEnd();
+      // The previous match was empty: step over one character, otherwise the
+      // same empty match would be returned forever.
+      if (begin == getMatchStart()) {
+        begin += 1;
+        if (begin > getChars().length()) {
+          setMatch(-2, -2);
+          return false;
+        }
+      }
+    }
+
+    // match_end is exclusive: the offset after the last matched character.
+    int match_start;
+    int match_end;
+    if (automaton.isAccept(automaton.getInitialState())) {
+      // the empty string at 'begin' is already a match
+      match_start = begin;
+      match_end = begin;
+    } else {
+      match_start = -1;
+      match_end = -1;
+    }
+    int l = getChars().length();
+    while (begin < l) {
+      int p = automaton.getInitialState();
+      for (int i = begin; i < l; i += 1) {
+        final int new_state = automaton.step(p, getChars().charAt(i));
+        if (new_state == -1) {
+          break;
+        } else if (automaton.isAccept(new_state)) {
+          // found a (longer) match from begin to i + 1
+          match_start = begin;
+          match_end = i + 1;
+        }
+        p = new_state;
+      }
+      if (match_start != -1) {
+        setMatch(match_start, match_end);
+        return true;
+      }
+      begin += 1;
+    }
+    if (match_start != -1) {
+      // only reachable with an empty match at the end of the input
+      setMatch(match_start, match_end);
+      return true;
+    } else {
+      setMatch(-2, -2);
+      return false;
+    }
+  }
+
+  /**
+   * Records the bounds of the current match.
+   *
+   * @param matchStart offset of the first matched character
+   * @param matchEnd offset after the last matched character
+   * @throws IllegalArgumentException if matchStart is greater than matchEnd
+   */
+  private void setMatch(final int matchStart, final int matchEnd)
+      throws IllegalArgumentException {
+    if (matchStart > matchEnd) {
+      throw new IllegalArgumentException(
+          "Start must be less than or equal to end: " + matchStart + ", "
+              + matchEnd);
+    }
+    this.matchStart = matchStart;
+    this.matchEnd = matchEnd;
+  }
+
+  private int getMatchStart() {
+    return matchStart;
+  }
+
+  private int getMatchEnd() {
+    return matchEnd;
+  }
+
+  private CharSequence getChars() {
+    return chars;
+  }
+
+  /**
+   * Returns the offset after the last character matched.
+   *
+   * @return The offset after the last character matched.
+   * @throws IllegalStateException if there has not been a match attempt or if
+   *           the last attempt yielded no results.
+   */
+  public int end() throws IllegalStateException {
+    matchGood();
+    return matchEnd;
+  }
+
+  /**
+   * Returns the offset after the last character matched of the specified
+   * capturing group.
+   * <br>
+   * Note that because the automaton does not support capturing groups the only
+   * valid group is 0 (the entire match).
+   *
+   * @param group the desired capturing group.
+   * @return The offset after the last character matched of the specified
+   *         capturing group.
+   * @throws IllegalStateException if there has not been a match attempt or if
+   *           the last attempt yielded no results.
+   * @throws IndexOutOfBoundsException if the specified capturing group does not
+   *           exist in the underlying automaton.
+   */
+  public int end(final int group) throws IndexOutOfBoundsException,
+      IllegalStateException {
+    onlyZero(group);
+    return end();
+  }
+
+  /**
+   * Returns the subsequence of the input found by the previous match.
+   *
+   * @return The subsequence of the input found by the previous match.
+   * @throws IllegalStateException if there has not been a match attempt or if
+   *           the last attempt yielded no results.
+   */
+  public String group() throws IllegalStateException {
+    matchGood();
+    return chars.subSequence(matchStart, matchEnd).toString();
+  }
+
+  /**
+   * Returns the subsequence of the input found by the specified capturing group
+   * during the previous match operation.
+   * <br>
+   * Note that because the automaton does not support capturing groups the only
+   * valid group is 0 (the entire match).
+   *
+   * @param group the desired capturing group.
+   * @return The subsequence of the input found by the specified capturing group
+   *         during the previous match operation. Or {@code null} if the given
+   *         group did not match.
+   * @throws IllegalStateException if there has not been a match attempt or if
+   *           the last attempt yielded no results.
+   * @throws IndexOutOfBoundsException if the specified capturing group does not
+   *           exist in the underlying automaton.
+   */
+  public String group(final int group) throws IndexOutOfBoundsException,
+      IllegalStateException {
+    onlyZero(group);
+    return group();
+  }
+
+  /**
+   * Returns the number of capturing groups in the underlying automaton.
+   * <br>
+   * Note that because the automaton does not support capturing groups this
+   * method will always return 0.
+   *
+   * @return The number of capturing groups in the underlying automaton.
+   */
+  public int groupCount() {
+    return 0;
+  }
+
+  /**
+   * Returns the offset of the first character matched.
+   *
+   * @return The offset of the first character matched.
+   * @throws IllegalStateException if there has not been a match attempt or if
+   *           the last attempt yielded no results.
+   */
+  public int start() throws IllegalStateException {
+    matchGood();
+    return matchStart;
+  }
+
+  /**
+   * Returns the offset of the first character matched of the specified
+   * capturing group.
+   * <br>
+   * Note that because the automaton does not support capturing groups the only
+   * valid group is 0 (the entire match).
+   *
+   * @param group the desired capturing group.
+   * @return The offset of the first character matched of the specified
+   *         capturing group.
+   * @throws IllegalStateException if there has not been a match attempt or if
+   *           the last attempt yielded no results.
+   * @throws IndexOutOfBoundsException if the specified capturing group does not
+   *           exist in the underlying automaton.
+   */
+  public int start(int group) throws IndexOutOfBoundsException,
+      IllegalStateException {
+    onlyZero(group);
+    return start();
+  }
+
+  /** Helper method that requires the group argument to be 0. */
+  private static void onlyZero(final int group)
+      throws IndexOutOfBoundsException {
+    if (group != 0) {
+      throw new IndexOutOfBoundsException("The only group supported is 0.");
+    }
+  }
+
+  /** Helper method to check that the last match attempt was valid. */
+  private void matchGood() throws IllegalStateException {
+    // -1 (no attempt yet) and -2 (no match found) are both negative
+    if ((matchStart < 0) || (matchEnd < 0)) {
+      throw new IllegalStateException("There was no available match.");
+    }
+  }
+}
Property changes on: src/java/org/apache/lucene/util/automaton/AutomatonMatcher.java
___________________________________________________________________
Added: svn:eol-style
   + native
Index: src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 0)
@@ -0,0 +1,278 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.Set; + +/** + * Operations for minimizing automata. + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class MinimizationOperations {
+
+  private MinimizationOperations() {}
+
+  /**
+   * Minimizes (and determinizes if not already deterministic) the given
+   * automaton. Singleton automata are left untouched; in every case the
+   * cached hash code of the automaton is recomputed afterwards.
+   *
+   * @param a the automaton to minimize in place
+   * @see Automaton#setMinimization(int)
+   */
+  public static void minimize(Automaton a) {
+    if (!a.isSingleton()) {
+      minimizeHopcroft(a);
+    }
+    // recompute hash code
+    a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2;
+    // NOTE(review): 0 is presumably reserved to mean "not computed" - confirm
+    if (a.hash_code == 0) a.hash_code = 1;
+  }
+
+  /** Appends <code>size</code> null entries to the given list. */
+  private static <T> void initialize(ArrayList<T> list, int size) {
+    for (int i = 0; i < size; i++)
+      list.add(null);
+  }
+
+  /**
+   * Minimizes the given automaton using Hopcroft's algorithm. The automaton is
+   * determinized and totalized first and is modified in place.
+   *
+   * @param a the automaton to minimize
+   */
+  public static void minimizeHopcroft(Automaton a) {
+    a.determinize();
+    Set<Transition> tr = a.initial.getTransitions();
+    if (tr.size() == 1) {
+      // a single self-loop over the whole alphabet on the initial state:
+      // nothing to minimize
+      Transition t = tr.iterator().next();
+      if (t.to == a.initial && t.min == Character.MIN_VALUE
+          && t.max == Character.MAX_VALUE) return;
+    }
+    a.totalize();
+    // make arrays for numbered states and effective alphabet
+    Set<State> ss = a.getStates();
+    State[] states = new State[ss.size()];
+    int number = 0;
+    for (State q : ss) {
+      states[number] = q;
+      q.number = number++;
+    }
+    char[] sigma = a.getStartPoints();
+    // initialize data structures
+    // reverse.get(p).get(x): states that reach state p on alphabet class x
+    ArrayList<ArrayList<LinkedList<State>>> reverse = new ArrayList<ArrayList<LinkedList<State>>>();
+    for (int q = 0; q < states.length; q++) {
+      ArrayList<LinkedList<State>> v = new ArrayList<LinkedList<State>>();
+      initialize(v, sigma.length);
+      reverse.add(v);
+    }
+    boolean[][] reverse_nonempty = new boolean[states.length][sigma.length];
+    // partition.get(j): states currently in block j; block[q]: block of state q
+    ArrayList<LinkedList<State>> partition = new ArrayList<LinkedList<State>>();
+    initialize(partition, states.length);
+    int[] block = new int[states.length];
+    StateList[][] active = new StateList[states.length][sigma.length];
+    StateListNode[][] active2 = new StateListNode[states.length][sigma.length];
+    // work list of (block, alphabet class) pairs; pending2[x][j] mirrors
+    // membership of (j, x) in the work list
+    LinkedList<IntPair> pending = new LinkedList<IntPair>();
+    boolean[][] pending2 = new boolean[sigma.length][states.length];
+    ArrayList<State> split = new ArrayList<State>();
+    boolean[] split2 = new boolean[states.length];
+    ArrayList<Integer> refine = new ArrayList<Integer>();
+    boolean[] refine2 = new boolean[states.length];
+    ArrayList<ArrayList<State>> splitblock = new ArrayList<ArrayList<State>>();
+    initialize(splitblock, states.length);
+    for (int q = 0; q < states.length; q++) {
+      splitblock.set(q, new ArrayList<State>());
+      partition.set(q, new LinkedList<State>());
+      for (int x = 0; x < sigma.length; x++) {
+        reverse.get(q).set(x, new LinkedList<State>());
+        active[q][x] = new StateList();
+      }
+    }
+    // find initial partition and reverse edges
+    for (int q = 0; q < states.length; q++) {
+      State qq = states[q];
+      // block 0 holds the accept states, block 1 the others
+      int j;
+      if (qq.accept) j = 0;
+      else j = 1;
+      partition.get(j).add(qq);
+      block[qq.number] = j;
+      for (int x = 0; x < sigma.length; x++) {
+        char y = sigma[x];
+        State p = qq.step(y);
+        reverse.get(p.number).get(x).add(qq);
+        reverse_nonempty[p.number][x] = true;
+      }
+    }
+    // initialize active sets
+    for (int j = 0; j <= 1; j++)
+      for (int x = 0; x < sigma.length; x++)
+        for (State qq : partition.get(j))
+          if (reverse_nonempty[qq.number][x]) active2[qq.number][x] = active[j][x]
+              .add(qq);
+    // initialize pending
+    for (int x = 0; x < sigma.length; x++) {
+      // enqueue the block with the smaller active set for each class
+      int a0 = active[0][x].size;
+      int a1 = active[1][x].size;
+      int j;
+      if (a0 <= a1) j = 0;
+      else j = 1;
+      pending.add(new IntPair(j, x));
+      pending2[x][j] = true;
+    }
+    // process pending until fixed point
+    int k = 2; // number of blocks in use; also the index of the next new block
+    while (!pending.isEmpty()) {
+      IntPair ip = pending.removeFirst();
+      int p = ip.n1;
+      int x = ip.n2;
+      pending2[x][p] = false;
+      // find states that need to be split off their blocks
+      for (StateListNode m = active[p][x].first; m != null; m = m.next)
+        for (State s : reverse.get(m.q.number).get(x))
+          if (!split2[s.number]) {
+            split2[s.number] = true;
+            split.add(s);
+            int j = block[s.number];
+            splitblock.get(j).add(s);
+            if (!refine2[j]) {
+              refine2[j] = true;
+              refine.add(j);
+            }
+          }
+      // refine blocks
+      for (int j : refine) {
+        // only split when a proper subset of block j was marked
+        if (splitblock.get(j).size() < partition.get(j).size()) {
+          LinkedList<State> b1 = partition.get(j);
+          LinkedList<State> b2 = partition.get(k);
+          for (State s : splitblock.get(j)) {
+            b1.remove(s);
+            b2.add(s);
+            block[s.number] = k;
+            for (int c = 0; c < sigma.length; c++) {
+              StateListNode sn = active2[s.number][c];
+              if (sn != null && sn.sl == active[j][c]) {
+                sn.remove();
+                active2[s.number][c] = active[k][c].add(s);
+              }
+            }
+          }
+          // update pending
+          for (int c = 0; c < sigma.length; c++) {
+            int aj = active[j][c].size;
+            int ak = active[k][c].size;
+            if (!pending2[c][j] && 0 < aj && aj <= ak) {
+              pending2[c][j] = true;
+              pending.add(new IntPair(j, c));
+            } else {
+              pending2[c][k] = true;
+              pending.add(new IntPair(k, c));
+            }
+          }
+          k++;
+        }
+        // reset the per-round marks for block j
+        for (State s : splitblock.get(j))
+          split2[s.number] = false;
+        refine2[j] = false;
+        splitblock.get(j).clear();
+      }
+      split.clear();
+      refine.clear();
+    }
+    // make a new state for each equivalence class, set initial state
+    State[] newstates = new State[k];
+    for (int n = 0; n < newstates.length; n++) {
+      State s = new State();
+      newstates[n] = s;
+      for (State q : partition.get(n)) {
+        if (q == a.initial) a.initial = s;
+        s.accept = q.accept;
+        s.number = q.number; // select representative
+        q.number = n; // old state now records the index of its block
+      }
+    }
+    // build transitions and set acceptance
+    for (int n = 0; n < newstates.length; n++) {
+      State s = newstates[n];
+      s.accept = states[s.number].accept;
+      for (Transition t : states[s.number].transitions)
+        s.transitions.add(new Transition(t.min, t.max, newstates[t.to.number]));
+    }
+    a.removeDeadTransitions();
+  }
+
+  /** Plain pair of ints; used above as (block index, alphabet class index). */
+  static class IntPair {
+
+    int n1, n2;
+
+    IntPair(int n1, int n2) {
+      this.n1 = n1;
+      this.n2 = n2;
+    }
+  }
+
+  /** Doubly linked list of states that tracks its own size. */
+  static class StateList {
+
+    int size;
+
+    StateListNode first, last;
+
+    /** Appends the state and returns the node that holds it. */
+    StateListNode add(State q) {
+      return new StateListNode(q, this);
+    }
+  }
+
+  /** Node of a {@link StateList}; links itself into the list on creation. */
+  static class StateListNode {
+
+    State q;
+
+    StateListNode next, prev;
+
+    StateList sl;
+
+    StateListNode(State q, StateList sl) {
+      this.q = q;
+      this.sl = sl;
+      // append to the tail of sl (or become its only node)
+      if (sl.size++ == 0) sl.first = sl.last = this;
+      else {
+        sl.last.next = this;
+        prev = sl.last;
+        sl.last = this;
+      }
+    }
+
+    /** Unlinks this node from its list. */
+    void remove() {
+      sl.size--;
+      if (sl.first == this) sl.first = next;
+      else prev.next = next;
+      if (sl.last == this) sl.last = prev;
+      else next.prev = prev;
+    }
+  }
+}
Property changes on: src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
___________________________________________________________________
Added: svn:eol-style
   + native
Index: src/java/org/apache/lucene/util/automaton/StatePair.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/StatePair.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/StatePair.java (revision 0)
@@ -0,0 +1,104 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +/** + * Pair of states. + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class StatePair {
+  State s;
+  State s1;
+  State s2;
+
+  /** Creates a pair (s1, s2) that additionally carries the state s. */
+  StatePair(State s, State s1, State s2) {
+    this.s = s;
+    this.s1 = s1;
+    this.s2 = s2;
+  }
+
+  /**
+   * Constructs a new state pair; the extra state <code>s</code> stays null.
+   *
+   * @param s1 first state
+   * @param s2 second state
+   */
+  public StatePair(State s1, State s2) {
+    this(null, s1, s2);
+  }
+
+  /**
+   * Returns first component of this pair.
+   *
+   * @return first state
+   */
+  public State getFirstState() {
+    return this.s1;
+  }
+
+  /**
+   * Returns second component of this pair.
+   *
+   * @return second state
+   */
+  public State getSecondState() {
+    return this.s2;
+  }
+
+  /**
+   * Checks for equality. Both components are compared by reference; the
+   * extra state <code>s</code> does not take part in the comparison.
+   *
+   * @param obj object to compare with
+   * @return <code>true</code> if <code>obj</code> represents the same pair of
+   *         states as this pair
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (!(obj instanceof StatePair)) {
+      return false;
+    }
+    StatePair other = (StatePair) obj;
+    return other.s1 == s1 && other.s2 == s2;
+  }
+
+  /**
+   * Returns hash code: the sum of the hash codes of both components.
+   *
+   * @return hash code
+   */
+  @Override
+  public int hashCode() {
+    int first = s1.hashCode();
+    int second = s2.hashCode();
+    return first + second;
+  }
+}
Property changes on: src/java/org/apache/lucene/util/automaton/StatePair.java
___________________________________________________________________
Added: svn:eol-style
   + native
Index: src/java/org/apache/lucene/util/automaton/RunAutomaton.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 0)
@@ -0,0 +1,305 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InvalidClassException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OptionalDataException; +import java.io.OutputStream; +import java.io.Serializable; +import java.net.URL; +import java.util.Set; + +/** + * Finite-state automaton with fast run operation. + * + *

+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class RunAutomaton implements Serializable {
+
+  static final long serialVersionUID = 20001;
+
+  int size;
+  boolean[] accept;
+  int initial;
+  int[] transitions; // delta(state,c) = transitions[state*points.length +
+                     // getCharClass(c)]
+  char[] points; // char interval start points
+  int[] classmap; // map from char number to character class
+
+  /**
+   * Sets alphabet table for optimal run performance: fills
+   * <code>classmap</code> with one entry per char value holding the index of
+   * the interval in <code>points</code> that the char belongs to.
+   */
+  final void setAlphabet() {
+    classmap = new int[Character.MAX_VALUE - Character.MIN_VALUE + 1];
+    int i = 0;
+    for (int j = 0; j <= Character.MAX_VALUE - Character.MIN_VALUE; j++) {
+      // move on to the next interval once j reaches its start point
+      if (i + 1 < points.length && j == points[i + 1]) i++;
+      classmap[j] = i;
+    }
+  }
+
+  /**
+   * Returns a string representation of this automaton: the initial state
+   * followed by, for each state, its acceptance status and its outgoing
+   * transitions.
+   */
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    b.append("initial state: ").append(initial).append("\n");
+    for (int i = 0; i < size; i++) {
+      b.append("state ").append(i);
+      if (accept[i]) b.append(" [accept]:\n");
+      else b.append(" [reject]:\n");
+      for (int j = 0; j < points.length; j++) {
+        int k = transitions[i * points.length + j];
+        if (k != -1) {
+          // interval j runs from its start point up to just before the next
+          // start point (or to the last char for the final interval)
+          char min = points[j];
+          char max;
+          if (j + 1 < points.length) max = (char) (points[j + 1] - 1);
+          else max = Character.MAX_VALUE;
+          b.append(" ");
+          Transition.appendCharString(min, b);
+          if (min != max) {
+            b.append("-");
+            Transition.appendCharString(max, b);
+          }
+          b.append(" -> ").append(k).append("\n");
+        }
+      }
+    }
+    return b.toString();
+  }
+
+  /**
+   * Returns number of states in automaton.
+   */
+  public int getSize() {
+    return size;
+  }
+
+  /**
+   * Returns acceptance status for given state.
+   */
+  public boolean isAccept(int state) {
+    return accept[state];
+  }
+
+  /**
+   * Returns initial state.
+   */
+  public int getInitialState() {
+    return initial;
+  }
+
+  /**
+   * Returns array of character class interval start points. The returned
+   * array is a copy, so changes made by the caller do not affect this
+   * automaton.
+   */
+  public char[] getCharIntervals() {
+    return points.clone();
+  }
+
+  /**
+   * Gets character class of given char.
+   */
+  int getCharClass(char c) {
+    return SpecialOperations.findIndex(c, points);
+  }
+
+  @SuppressWarnings("unused")
+  private RunAutomaton() {}
+
+  /**
+   * Constructs a new <code>RunAutomaton</code> from a deterministic
+   * <code>Automaton</code>. Same as <code>RunAutomaton(a, true)</code>.
+   *
+   * @param a an automaton
+   */
+  public RunAutomaton(Automaton a) {
+    this(a, true);
+  }
+
+  /**
+   * Retrieves a serialized <code>RunAutomaton</code> located by a URL. The
+   * stream opened on the URL is closed before this method returns.
+   *
+   * @param url URL of serialized automaton
+   * @exception IOException if input/output related exception occurs
+   * @exception OptionalDataException if the data is not a serialized object
+   * @exception InvalidClassException if the class serial number does not match
+   * @exception ClassCastException if the data is not a serialized
+   *              <code>RunAutomaton</code>
+   * @exception ClassNotFoundException if the class of the serialized object
+   *              cannot be found
+   */
+  public static RunAutomaton load(URL url) throws IOException,
+      OptionalDataException, ClassCastException, ClassNotFoundException,
+      InvalidClassException {
+    InputStream in = url.openStream();
+    try {
+      return load(in);
+    } finally {
+      // this method opened the stream, so it is responsible for closing it
+      in.close();
+    }
+  }
+
+  /**
+   * Retrieves a serialized <code>RunAutomaton</code> from a stream. The stream
+   * is not closed by this method.
+   *
+   * @param stream input stream with serialized automaton
+   * @exception IOException if input/output related exception occurs
+   * @exception OptionalDataException if the data is not a serialized object
+   * @exception InvalidClassException if the class serial number does not match
+   * @exception ClassCastException if the data is not a serialized
+   *              <code>RunAutomaton</code>
+   * @exception ClassNotFoundException if the class of the serialized object
+   *              cannot be found
+   */
+  public static RunAutomaton load(InputStream stream) throws IOException,
+      OptionalDataException, ClassCastException, ClassNotFoundException,
+      InvalidClassException {
+    // NOTE(review): native Java deserialization - only use on trusted data.
+    ObjectInputStream s = new ObjectInputStream(stream);
+    return (RunAutomaton) s.readObject();
+  }
+
+  /**
+   * Writes this <code>RunAutomaton</code> to the given stream. The stream is
+   * flushed but not closed.
+   *
+   * @param stream output stream for serialized automaton
+   * @exception IOException if input/output related exception occurs
+   */
+  public void store(OutputStream stream) throws IOException {
+    ObjectOutputStream s = new ObjectOutputStream(stream);
+    s.writeObject(this);
+    s.flush();
+  }
+
+  /**
+   * Constructs a new <code>RunAutomaton</code> from a deterministic
+   * <code>Automaton</code>. If the given automaton is not deterministic, it is
+   * determinized first.
+   *
+   * @param a an automaton
+   * @param tableize if true, a transition table is created which makes the
+   *          <code>run</code> method faster in return of a higher memory usage
+   */
+  public RunAutomaton(Automaton a, boolean tableize) {
+    a.determinize();
+    points = a.getStartPoints();
+    Set<State> states = a.getStates();
+    Automaton.setStateNumbers(states);
+    initial = a.initial.number;
+    size = states.size();
+    accept = new boolean[size];
+    transitions = new int[size * points.length];
+    // -1 marks "no transition" until a target state is filled in below
+    for (int n = 0; n < size * points.length; n++)
+      transitions[n] = -1;
+    for (State s : states) {
+      int n = s.number;
+      accept[n] = s.accept;
+      for (int c = 0; c < points.length; c++) {
+        State q = s.step(points[c]);
+        if (q != null) transitions[n * points.length + c] = q.number;
+      }
+    }
+    if (tableize) setAlphabet();
+  }
+
+  /**
+   * Returns the state obtained by reading the given char from the given state.
+   * Returns -1 if not obtaining any such state. (If the original
+   * <code>Automaton</code> had no dead states, -1 is returned here if and only
+   * if a dead state is entered in an equivalent automaton with a total
+   * transition function.)
+   */
+  public int step(int state, char c) {
+    // without the alphabet table the character class is looked up per call
+    if (classmap == null) return transitions[state * points.length
+        + getCharClass(c)];
+    else return transitions[state * points.length
+        + classmap[c - Character.MIN_VALUE]];
+  }
+
+  /**
+   * Returns true if the given string is accepted by this automaton.
+   */
+  public boolean run(String s) {
+    int p = initial;
+    int l = s.length();
+    for (int i = 0; i < l; i++) {
+      p = step(p, s.charAt(i));
+      if (p == -1) return false;
+    }
+    return accept[p];
+  }
+
+  /**
+   * Returns the length of the longest accepted run of the given string starting
+   * at the given offset.
+   *
+   * @param s the string
+   * @param offset offset into <code>s</code> where the run starts
+   * @return length of the longest accepted run, -1 if no run is accepted
+   */
+  public int run(String s, int offset) {
+    int p = initial;
+    int l = s.length();
+    int max = -1;
+    // r counts the chars consumed so far; remember it whenever p accepts
+    for (int r = 0; offset <= l; offset++, r++) {
+      if (accept[p]) max = r;
+      if (offset == l) break;
+      p = step(p, s.charAt(offset));
+      if (p == -1) break;
+    }
+    return max;
+  }
+
+  /**
+   * Creates a new automaton matcher for the given input.
+   *
+   * @param s the CharSequence to search
+   * @return A new automaton matcher for the given input
+   */
+  public AutomatonMatcher newMatcher(CharSequence s) {
+    return new AutomatonMatcher(s, this);
+  }
+
+  /**
+   * Creates a new automaton matcher for the given input. Offsets reported by
+   * the matcher are relative to <code>startOffset</code>, because the matcher
+   * works on the extracted subsequence.
+   *
+   * @param s the CharSequence to search
+   * @param startOffset the starting offset of the given character sequence
+   * @param endOffset the ending offset of the given character sequence
+   * @return A new automaton matcher for the given input
+   */
+  public AutomatonMatcher newMatcher(CharSequence s, int startOffset,
+      int endOffset) {
+    return new AutomatonMatcher(s.subSequence(startOffset, endOffset), this);
+  }
+}
Property changes on: src/java/org/apache/lucene/util/automaton/RunAutomaton.java
___________________________________________________________________
Added: svn:eol-style
   + native
Index: src/java/org/apache/lucene/util/automaton/BasicAutomata.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0)
@@ -0,0 +1,482 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.apache.lucene.util.automaton; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +/** + * Construction of basic automata. + * + *

+ * WARNING: The status of the Automaton feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +final public class BasicAutomata { + // used by getWhitespaceAutomaton to match basic whitespace + private static final Automaton ws = Automaton.minimize(BasicAutomata + .makeCharSet(" \t\n\r").repeat()); + + private BasicAutomata() {} + + /** + * Returns a new (deterministic) automaton with the empty language. + */ + public static Automaton makeEmpty() { + Automaton a = new Automaton(); + State s = new State(); + a.initial = s; + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts only the empty string. + */ + public static Automaton makeEmptyString() { + Automaton a = new Automaton(); + a.singleton = ""; + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts all strings. + */ + public static Automaton makeAnyString() { + Automaton a = new Automaton(); + State s = new State(); + a.initial = s; + s.accept = true; + s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + s)); + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts any single character. + */ + public static Automaton makeAnyChar() { + return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE); + } + + /** + * Returns a new (deterministic) automaton that accepts a single character of + * the given value. + */ + public static Automaton makeChar(char c) { + Automaton a = new Automaton(); + a.singleton = Character.toString(c); + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts a single char whose + * value is in the given interval (including both end points). 
+ */ + public static Automaton makeCharRange(char min, char max) { + if (min == max) return makeChar(min); + Automaton a = new Automaton(); + State s1 = new State(); + State s2 = new State(); + a.initial = s1; + s2.accept = true; + if (min <= max) s1.transitions.add(new Transition(min, max, s2)); + a.deterministic = true; + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts a single character in + * the given set. + */ + public static Automaton makeCharSet(String set) { + if (set.length() == 1) return makeChar(set.charAt(0)); + Automaton a = new Automaton(); + State s1 = new State(); + State s2 = new State(); + a.initial = s1; + s2.accept = true; + for (int i = 0; i < set.length(); i++) + s1.transitions.add(new Transition(set.charAt(i), s2)); + a.deterministic = true; + a.reduce(); + return a; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of length + * x.substring(n).length(). + */ + private static State anyOfRightLength(String x, int n) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else s.addTransition(new Transition('0', '9', anyOfRightLength(x, n + 1))); + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value at least + * x.substring(n) and length x.substring(n).length(). + */ + private static State atLeast(String x, int n, Collection initials, + boolean zeros) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + if (zeros) initials.add(s); + char c = x.charAt(n); + s.addTransition(new Transition(c, atLeast(x, n + 1, initials, zeros + && c == '0'))); + if (c < '9') s.addTransition(new Transition((char) (c + 1), '9', + anyOfRightLength(x, n + 1))); + } + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value at most + * x.substring(n) and length x.substring(n).length(). 
+ */ + private static State atMost(String x, int n) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + char c = x.charAt(n); + s.addTransition(new Transition(c, atMost(x, (char) n + 1))); + if (c > '0') s.addTransition(new Transition('0', (char) (c - 1), + anyOfRightLength(x, n + 1))); + } + return s; + } + + /** + * Constructs sub-automaton corresponding to decimal numbers of value between + * x.substring(n) and y.substring(n) and of length x.substring(n).length() + * (which must be equal to y.substring(n).length()). + */ + private static State between(String x, String y, int n, + Collection initials, boolean zeros) { + State s = new State(); + if (x.length() == n) s.setAccept(true); + else { + if (zeros) initials.add(s); + char cx = x.charAt(n); + char cy = y.charAt(n); + if (cx == cy) s.addTransition(new Transition(cx, between(x, y, n + 1, + initials, zeros && cx == '0'))); + else { // cx0, use fixed number of digits (strings must be prefixed + * by 0's to obtain the right length) - otherwise, the number of + * digits is not fixed + * @exception IllegalArgumentException if min>max or if numbers in the + * interval cannot be expressed with the given fixed number of + * digits + */ + public static Automaton makeInterval(int min, int max, int digits) + throws IllegalArgumentException { + Automaton a = new Automaton(); + String x = Integer.toString(min); + String y = Integer.toString(max); + if (min > max || (digits > 0 && y.length() > digits)) throw new IllegalArgumentException(); + int d; + if (digits > 0) d = digits; + else d = y.length(); + StringBuilder bx = new StringBuilder(); + for (int i = x.length(); i < d; i++) + bx.append('0'); + bx.append(x); + x = bx.toString(); + StringBuilder by = new StringBuilder(); + for (int i = y.length(); i < d; i++) + by.append('0'); + by.append(y); + y = by.toString(); + Collection initials = new ArrayList(); + a.initial = between(x, y, 0, initials, digits <= 0); + if (digits <= 0) { + ArrayList 
pairs = new ArrayList(); + for (State p : initials) + if (a.initial != p) pairs.add(new StatePair(a.initial, p)); + a.addEpsilons(pairs); + a.initial.addTransition(new Transition('0', a.initial)); + a.deterministic = false; + } else a.deterministic = true; + a.checkMinimizeAlways(); + return a; + } + + /** + * Returns a new (deterministic) automaton that accepts the single given + * string. + */ + public static Automaton makeString(String s) { + Automaton a = new Automaton(); + a.singleton = s; + a.deterministic = true; + return a; + } + + /** + * Constructs automaton that accepts strings representing nonnegative integers + * that are not larger than the given value. + * + * @param n string representation of maximum value + */ + public static Automaton makeMaxInteger(String n) { + int i = 0; + while (i < n.length() && n.charAt(i) == '0') + i++; + StringBuilder b = new StringBuilder(); + b.append("0*(0|"); + if (i < n.length()) b.append("[0-9]{1," + (n.length() - i - 1) + "}|"); + maxInteger(n.substring(i), 0, b); + b.append(")"); + return Automaton.minimize((new RegExp(b.toString())).toAutomaton()); + } + + private static void maxInteger(String n, int i, StringBuilder b) { + b.append('('); + if (i < n.length()) { + char c = n.charAt(i); + if (c != '0') b.append("[0-" + (char) (c - 1) + "][0-9]{" + + (n.length() - i - 1) + "}|"); + b.append(c); + maxInteger(n, i + 1, b); + } + b.append(')'); + } + + /** + * Constructs automaton that accepts strings representing nonnegative integers + * that are not less than the given value. 
+ * + * @param n string representation of minimum value + */ + public static Automaton makeMinInteger(String n) { + int i = 0; + while (i + 1 < n.length() && n.charAt(i) == '0') + i++; + StringBuilder b = new StringBuilder(); + b.append("0*"); + minInteger(n.substring(i), 0, b); + b.append("[0-9]*"); + return Automaton.minimize((new RegExp(b.toString())).toAutomaton()); + } + + private static void minInteger(String n, int i, StringBuilder b) { + b.append('('); + if (i < n.length()) { + char c = n.charAt(i); + if (c != '9') b.append("[" + (char) (c + 1) + "-9][0-9]{" + + (n.length() - i - 1) + "}|"); + b.append(c); + minInteger(n, i + 1, b); + } + b.append(')'); + } + + /** + * Constructs automaton that accepts strings representing decimal numbers that + * can be written with at most the given number of digits. Surrounding + * whitespace is permitted. + * + * @param i max number of necessary digits + */ + public static Automaton makeTotalDigits(int i) { + return Automaton.minimize((new RegExp("[ \t\n\r]*[-+]?0*([0-9]{0," + i + + "}|((([0-9]\\.*){0," + i + "})&@\\.@)0*)[ \t\n\r]*")).toAutomaton()); + } + + /** + * Constructs automaton that accepts strings representing decimal numbers that + * can be written with at most the given number of digits in the fraction + * part. Surrounding whitespace is permitted. + * + * @param i max number of necessary fraction digits + */ + public static Automaton makeFractionDigits(int i) { + return Automaton.minimize((new RegExp("[ \t\n\r]*[-+]?[0-9]+(\\.[0-9]{0," + + i + "}0*)?[ \t\n\r]*")).toAutomaton()); + } + + /** + * Constructs automaton that accepts strings representing the given integer. + * Surrounding whitespace is permitted. 
+ * + * @param value string representation of integer + */ + public static Automaton makeIntegerValue(String value) { + boolean minus = false; + int i = 0; + while (i < value.length()) { + char c = value.charAt(i); + if (c == '-') minus = true; + if (c >= '1' && c <= '9') break; + i++; + } + StringBuilder b = new StringBuilder(); + b.append(value.substring(i)); + if (b.length() == 0) b.append("0"); + Automaton s; + if (minus) s = makeChar('-'); + else s = makeChar('+').optional(); + Automaton ws = getWhitespaceAutomaton(); + return Automaton.minimize(ws.concatenate( + s.concatenate(makeChar('0').repeat()).concatenate( + makeString(b.toString()))).concatenate(ws)); + } + + /** + * Constructs automaton that accepts strings representing the given decimal + * number. Surrounding whitespace is permitted. + * + * @param value string representation of decimal number + */ + public static Automaton makeDecimalValue(String value) { + boolean minus = false; + int i = 0; + while (i < value.length()) { + char c = value.charAt(i); + if (c == '-') minus = true; + if ((c >= '1' && c <= '9') || c == '.') break; + i++; + } + StringBuilder b1 = new StringBuilder(); + StringBuilder b2 = new StringBuilder(); + int p = value.indexOf('.', i); + if (p == -1) b1.append(value.substring(i)); + else { + b1.append(value.substring(i, p)); + i = value.length() - 1; + while (i > p) { + char c = value.charAt(i); + if (c >= '1' && c <= '9') break; + i--; + } + b2.append(value.substring(p + 1, i + 1)); + } + if (b1.length() == 0) b1.append("0"); + Automaton s; + if (minus) s = makeChar('-'); + else s = makeChar('+').optional(); + Automaton d; + if (b2.length() == 0) d = makeChar('.') + .concatenate(makeChar('0').repeat(1)).optional(); + else d = makeChar('.').concatenate(makeString(b2.toString())).concatenate( + makeChar('0').repeat()); + Automaton ws = getWhitespaceAutomaton(); + return Automaton.minimize(ws.concatenate( + s.concatenate(makeChar('0').repeat()).concatenate( 
+ makeString(b1.toString())).concatenate(d)).concatenate(ws)); + } + + /** + * Constructs deterministic automaton that matches strings that contain the + * given substring. + */ + public static Automaton makeStringMatcher(String s) { + Automaton a = new Automaton(); + State[] states = new State[s.length() + 1]; + states[0] = a.initial; + for (int i = 0; i < s.length(); i++) + states[i + 1] = new State(); + State f = states[s.length()]; + f.accept = true; + f.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + f)); + for (int i = 0; i < s.length(); i++) { + Set done = new HashSet(); + char c = s.charAt(i); + states[i].transitions.add(new Transition(c, states[i + 1])); + done.add(c); + for (int j = i; j >= 1; j--) { + char d = s.charAt(j - 1); + if (!done.contains(d) + && s.substring(0, j - 1).equals(s.substring(i - j + 1, i))) { + states[i].transitions.add(new Transition(d, states[j])); + done.add(d); + } + } + char[] da = new char[done.size()]; + int h = 0; + for (char w : done) + da[h++] = w; + Arrays.sort(da); + int from = Character.MIN_VALUE; + int k = 0; + while (from <= Character.MAX_VALUE) { + while (k < da.length && da[k] == from) { + k++; + from++; + } + if (from <= Character.MAX_VALUE) { + int to = Character.MAX_VALUE; + if (k < da.length) { + to = da[k] - 1; + k++; + } + states[i].transitions.add(new Transition((char) from, (char) to, + states[0])); + from = to + 2; + } + } + } + a.deterministic = true; + return a; + } + + public static Automaton getWhitespaceAutomaton() { + return ws; + } +} Property changes on: src/java/org/apache/lucene/util/automaton/BasicAutomata.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/util/automaton/package.html =================================================================== --- src/java/org/apache/lucene/util/automaton/package.html (revision 0) +++ src/java/org/apache/lucene/util/automaton/package.html (revision 
0) @@ -0,0 +1,50 @@ + + + + +Finite-state automaton for regular expressions. +

+This package contains a full DFA/NFA implementation with Unicode +alphabet and support for all standard (and a number of non-standard) +regular expression operations. +

+The most commonly used functionality is located in the classes +{@link org.apache.lucene.util.automaton.Automaton} and +{@link org.apache.lucene.util.automaton.RegExp}. +

+For more information, go to the package home page at +http://www.brics.dk/automaton/. +

+WARNING: The status of the Automaton feature is experimental. +The APIs introduced here might change in the future and will not be +supported anymore in such a case. + + Property changes on: src/java/org/apache/lucene/util/automaton/package.html ___________________________________________________________________ Added: svn:eol-style + native Index: NOTICE.txt =================================================================== --- NOTICE.txt (revision 883088) +++ NOTICE.txt (working copy) @@ -28,3 +28,6 @@ ICU4J, (under contrib/collation) is licensed under an MIT styles license (contrib/collation/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Brics Automaton (under src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/