transitions = state.getSortedTransitions(false);
+ transitionCache.put(state, transitions
+ .toArray(new Transition[transitions.size()]));
+ }
+
+ String startPoint = cleanupPosition(nextString(""));
+
+ /*
+ * in this case this automaton will not accept any strings. start the
+ * enumeration at the empty string, next() will return false.
+ */
+ if (startPoint == null) {
+ startPoint = "";
+ }
+
+ lastTerm = queryTerm.createTerm(startPoint);
+ }
+
+ setEnum(reader.terms(lastTerm));
+ }
+
+ /**
+ * Construct an enumerator based upon an automaton, enumerating the specified
+ * field, working on a supplied reader.
+ *
+ * It will automagically determine whether or not to enumerate the term dictionary
+ * in a smart way, or to just do a linear scan depending upon a heuristic.
+ */
+ public AutomatonTermEnum(Automaton automaton, Term queryTerm, IndexReader reader)
+ throws IOException {
+ this(automaton, queryTerm, reader, AutomatonTermEnum.isSlow(automaton));
+ }
+
+ /**
+ * Heuristic to detect if an automaton will be so slow
+ * that it is better to do a linear enumeration.
+ *
+ * A very slow automaton will simply cause a lot of wasted disk seeks;
+ * in that case it is actually faster to do a linear enumeration instead.
+ *
+ * @param automaton automaton
+ * @return true if it will result in bad search performance
+ */
+ private static boolean isSlow(Automaton automaton) {
+ /*
+ * If the DFA has a leading Kleene star, or something similar, it will
+ * need to run against the entire term dictionary. In this case it's much
+ * better to do just that than to use smart enumeration.
+ *
+ * This heuristic looks for an initial loop with a range of at least 1/3
+ * of the Unicode BMP.
+ */
+ State initialState = automaton.getInitialState();
+ boolean linearMode = false;
+ for (Transition transition : initialState.getTransitions()) {
+ if (transition.getDest() == initialState &&
+ (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) {
+ linearMode = true;
+ break;
+ }
+ }
+ return linearMode;
+ }
+
+ /**
+ * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode.
+ */
+ public final boolean usesLinearMode() {
+ return linearMode;
+ }
+
+ @Override
+ public float difference() {
+ return 1.0f;
+ }
+
+ /**
+ * Returns true if the term matches the automaton. Also stashes away the term
+ * to assist with smart enumeration.
+ *
+ * In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted.
+ * In smart mode, it will never do this.
+ */
+ @Override
+ protected boolean termCompare(final Term term) {
+ lastTerm = term;
+ final String text = term.text();
+ if (term.field() == queryTerm.field()) {
+ return (!linearMode || text.endsWith(commonSuffix)) && runAutomaton.run(text);
+ } else {
+ // only set endEnum in linearMode
+ endEnum = linearMode;
+ return false;
+ }
+ }
+
+ /**
+ * In smart mode, increments to the next term matching this automaton.
+ * After a successful comparison, it simply tries the next term.
+ * After an unsuccessful comparison, it seeks to a smarter position.
+ *
+ * If the enum is in linear mode, it simply calls {@code super.next()} to
+ * just filter the current enum until {@link #endEnum} returns {@code true}.
+ */
+ @Override
+ public boolean next() throws IOException {
+ if (linearMode)
+ return super.next();
+
+ do {
+ /*
+ * if the previous enumeration was a match, don't even bother
+ * trying to compute the next place to seek to.
+ * this is an optimization for a DFA that matches many sequential terms,
+ * such as ab*
+ * we only do this if the automaton is infinite.
+ */
+ if (!finite && lastTerm == currentTerm) {
+ actualEnum.next();
+ } else {
+ // seek to the next possible string
+ String nextPoint = nextString(lastTerm.text());
+ if (nextPoint == null) { // no more possible strings can match
+ currentTerm = null;
+ endEnum = true;
+ return false;
+ }
+ // replace the old enumerator with a new one, positioned to a nice place
+ actualEnum.close();
+ actualEnum = reader.terms(lastTerm.createTerm(nextPoint));
+ }
+
+ Term candidateTerm = actualEnum.term(); // read a term
+
+ /*
+ * this means end of enumeration: no more terms for this field or no more
+ * terms at all
+ */
+ if (candidateTerm == null || candidateTerm.field() != queryTerm.field()) {
+ currentTerm = null;
+ endEnum = true;
+ return false;
+ }
+
+ // if the term matches the automaton, success!
+ if (termCompare(candidateTerm)) {
+ currentTerm = candidateTerm;
+ return true;
+ }
+ } while (true);
+ }
+
+ /**
+ * This method should only be called in linear mode, in smart
+ * mode the result is undefined, as the handling of exhausted enums
+ * is done inside {@link #next}.
+ */
+ @Override
+ protected boolean endEnum() {
+ assert linearMode : "endEnum() should only be called in linear mode";
+ return endEnum;
+ }
+
+ /**
+ * Returns the next String in lexicographic order after s that will not put
+ * the machine into a reject state. If such a string does not exist, returns
+ * null.
+ *
+ * The correctness of this method depends upon the automaton being deterministic,
+ * and having no transitions to dead states.
+ *
+ * @param s input String
+ * @return next valid String
+ */
+ private String nextString(String s) {
+ State state;
+ int pos = 0;
+
+ while (true) {
+ state = automaton.getInitialState();
+ // walk the automaton until a character is rejected.
+ for (pos = 0; pos < s.length(); pos++) {
+ State nextState = step(state, s.charAt(pos));
+ if (nextState == null)
+ break;
+ else
+ state = nextState;
+ }
+
+ // take the useful portion, and the last non-reject state, and attempt to
+ // append characters that will match.
+ String nextString = nextString(s, state, pos);
+ if (nextString != null) {
+ return cleanupPosition(nextString);
+ } else { /* no more solutions exist from this useful portion, backtrack */
+ String sprime = backtrack(s, pos);
+ if (sprime == null) /* no more solutions at all */
+ return null;
+ else if (runAutomaton.run(sprime)) /* String is good to go as-is */
+ return cleanupPosition(sprime);
+ else /* advance further */
+ s = sprime;
+ }
+ }
+ }
+
+ /**
+ * Returns the next String in lexicographic order after s that will not put
+ * the machine into a reject state. Appends some characters to the useful
+ * portion. If this cannot satisfy the machine, returns null. This method will
+ * walk the minimal path, in lexicographic order, as long as possible.
+ *
+ * @param s input String
+ * @param state current non-reject state
+ * @param useful useful portion of the string
+ * @return next valid String
+ */
+ private String nextString(String s, State state, int useful) {
+ /*
+ * the next lexicographic character must be greater than the existing
+ * character, if it exists.
+ */
+ char c = 0;
+ if (useful < s.length()) {
+ c = s.charAt(useful);
+ // if the next character is U+FFFF and is not part of the useful portion,
+ // then by definition it puts us in a reject state, and therefore this
+ // path is dead. there cannot be any higher transitions. backtrack.
+ if (c == '\uFFFF')
+ return null;
+ else
+ c++;
+ }
+
+ StringBuilder sb = new StringBuilder();
+ // append the useful portion
+ sb.append(s, 0, useful);
+
+ Set visited = new HashSet();
+ visited.add(state);
+
+ Transition transitions[] = getTransitions(state);
+
+ // find the minimal path (lexicographic order) that is >= c
+
+ for (int i = 0; i < transitions.length; i++) {
+ Transition transition = transitions[i];
+ if (transition.getMax() >= c) {
+ char nextChar = (char) Math.max(c, transition.getMin());
+ sb.append(nextChar);
+ state = transition.getDest();
+ /*
+ * as long as is possible, continue down the minimal path in
+ * lexicographic order. if a loop or accept state is encountered, stop.
+ */
+ while (!visited.contains(state) && !state.isAccept()) {
+ visited.add(state);
+ /*
+ * Note: we work with a DFA with no transitions to dead states.
+ * so the below is ok, if it is not an accept state,
+ * then there MUST be at least one transition.
+ */
+ transition = getTransitions(state)[0];
+ sb.append(transition.getMin());
+ state = transition.getDest();
+ }
+ return sb.toString();
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Backtrack thru the string after encountering a dead end.
+ *
+ * @param s input String
+ * @param useful useful portion of the string
+ * @return next valid String to evaluate against the DFA, or null
+ */
+ private String backtrack(String s, int useful) {
+ while (useful > 0) {
+ char nextChar = s.charAt(useful - 1);
+ // if a character is U+FFFF its a dead-end too,
+ // because there is no higher character in UTF-16 sort order.
+ if (nextChar != '\uFFFF') {
+ nextChar++;
+ return s.substring(0, useful - 1) + nextChar;
+ }
+ useful--;
+ }
+ return null; /* all solutions exhausted */
+ }
+
+ /**
+ * Get the cached set of transitions for a state.
+ */
+ private Transition[] getTransitions(State state) {
+ return transitionCache.get(state);
+ }
+
+ /**
+ * Step the state machine forward one character,
+ * using cached transitions.
+ */
+ private State step(State state, char c) {
+ Transition transitions[] = getTransitions(state);
+ for (int i = 0; i < transitions.length; i++)
+ if (transitions[i].getMin() <= c && c <= transitions[i].getMax())
+ return transitions[i].getDest();
+ return null;
+ }
+
+ /**
+ * if the seek position cannot be converted to valid UTF-8,
+ * then return the next valid String (in UTF-16 sort order) that
+ * can be converted to valid UTF-8.
+ */
+ private String cleanupPosition(String position) {
+ if (position != null) {
+ StringBuilder sb = new StringBuilder();
+
+ for (int i = 0; i < position.length(); i++) {
+ final char ch = position.charAt(i);
+ if (Character.isHighSurrogate(ch)) {
+ if (i + 1 < position.length()) {
+
+ final char ch2 = position.charAt(i + 1);
+ if (ch2 < Character.MIN_LOW_SURROGATE) {
+ // invalid case #1, initial or medial in term
+ // high paired with invalid low, bump the next char up to MIN_LOW
+ sb.append(ch);
+ sb.append(Character.MIN_LOW_SURROGATE);
+ return sb.toString();
+ } else if (ch2 > Character.MAX_LOW_SURROGATE) {
+ // invalid case #2, initial or medial in term
+ // high paired with invalid low, but its past the boundary.
+ // this means all supp. characters have been enumerated.
+ // ditch both the chars, replace with the first valid codepoint
+ // after the surrogate range.
+ sb.append((char)(Character.MAX_LOW_SURROGATE + 1));
+ return sb.toString();
+ } else {
+ sb.append(ch);
+ }
+
+ } else {
+ // invalid case #3, final in term
+ // unpaired high, tack on MIN_LOW
+ sb.append(ch);
+ sb.append(Character.MIN_LOW_SURROGATE);
+ return sb.toString();
+ }
+ } else if (i > 0 && Character.isLowSurrogate(ch)) {
+ final char ch1 = position.charAt(i - 1);
+ if (Character.isHighSurrogate(ch1)) {
+ sb.append(ch);
+ } else {
+ // invalid case #4, medial unpaired low. bump past the boundary.
+ sb.append((char)(Character.MAX_LOW_SURROGATE + 1));
+ return sb.toString();
+ }
+ } else if (Character.isLowSurrogate(ch)){
+ // invalid case #5, initial unpaired low. bump past the boundary.
+ sb.append((char)(Character.MAX_LOW_SURROGATE + 1));
+ return sb.toString();
+ } else {
+ sb.append(ch);
+ }
+ }
+ return sb.toString();
+ } else
+ return null;
+ }
+
+ /**
+ * if the suffix starts with a low surrogate, remove it.
+ * This won't be quite as efficient, but can be converted to valid UTF-8
+ *
+ * This isn't nearly as complex as cleanupPosition, because its not
+ * going to use this suffix to walk any path thru the terms.
+ *
+ */
+ private String cleanupSuffix(String suffix) {
+ if (suffix != null && suffix.length() > 0 &&
+ Character.isLowSurrogate(suffix.charAt(0)))
+ return suffix.substring(1);
+ else
+ return suffix;
+ }
+}
Property changes on: src\java\org\apache\lucene\search\AutomatonTermEnum.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
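
For intuition about the linear-vs-smart decision implemented in isSlow() above, here is a small standalone sketch (not part of the patch). ToyTransition and the sample transition tables are hypothetical stand-ins for org.apache.lucene.util.automaton.Transition/State; only the comparison against Character.MAX_VALUE / 3 mirrors the heuristic in the patch.

// Hypothetical toy model of the isSlow() heuristic above (not part of the patch).
final class ToyTransition {
  final char min, max;
  final Object dest;
  ToyTransition(char min, char max, Object dest) {
    this.min = min; this.max = max; this.dest = dest;
  }
}

final class IsSlowSketch {
  // true if the initial state has a self-loop spanning more than 1/3 of the BMP
  static boolean isSlow(Object initialState, ToyTransition[] transitions) {
    for (ToyTransition t : transitions) {
      if (t.dest == initialState && (t.max - t.min) > (Character.MAX_VALUE / 3)) {
        return true; // leading wide loop (e.g. ".*foo"): a linear scan is cheaper
      }
    }
    return false;
  }

  public static void main(String[] args) {
    Object s0 = new Object(), s1 = new Object();
    // ".*foo": the initial state loops over the whole BMP back to itself
    ToyTransition[] leadingDotStar = { new ToyTransition('\u0000', '\uFFFF', s0) };
    // "foo.*": the initial state only accepts 'f' and moves forward
    ToyTransition[] leadingLiteral = { new ToyTransition('f', 'f', s1) };
    System.out.println(isSlow(s0, leadingDotStar)); // true  -> linear mode
    System.out.println(isSlow(s0, leadingLiteral)); // false -> smart (seeking) mode
  }
}
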
Index: src/java/org/apache/lucene/search/AutomatonTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 0)
+++ src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 0)
@@ -0,0 +1,467 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermRef;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.Terms;
+
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.SpecialOperations;
+
+/**
+ * A FilteredTermsEnum that enumerates terms based upon what is accepted by a
+ * DFA.
+ *
+ * The algorithm is such:
+ *
+ * - As long as matches are successful, keep reading sequentially.
+ *
+ * - When a match fails, skip to the next string in lexicographic order that
+ * does not enter a reject state.
+ *
+ *
+ * The algorithm does not attempt to actually skip to the next string that is
+ * completely accepted. This is not possible when the language accepted by the
+ * FSM is not finite (i.e. * operator).
+ *
+ *
+ * If the DFA has a leading Kleene star, or something similar, it will
+ * need to run against the entire term dictionary. In this case it's much
+ * better to do just that than to use smart enumeration.
+ * This heuristic looks for an initial loop with a range of at least 1/3
+ * of the Unicode BMP.
+ * Use {@link #usesLinearMode} to find out if it enumerates all terms
+ * in linear mode without seeking.
+ *
+ *
+ * WARNING: The status of the Automaton feature is
+ * experimental. The APIs introduced here might change in the future and will
+ * not be supported anymore in such a case.
+ *
+ */
+public class AutomatonTermsEnum extends FilteredTermsEnum {
+ // the object-oriented form of the DFA
+ private final Automaton automaton;
+ // a tableized array-based form of the DFA
+ private final RunAutomaton runAutomaton;
+ // true if this enum will not seek around
+ private final boolean linearMode;
+ // common suffix of the automaton
+ private final TermRef commonSuffixRef;
+ // true if the automaton accepts a finite language
+ private final boolean finite;
+ // for complex machines that must make a lot of comparisons
+ private final Map transitionCache;
+ // used for unicode conversion from TermRef byte[] to char[]
+ private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+ // the reference used for seeking forwards through the term dictionary
+ private final TermRef seekTermRef = new TermRef();
+ // the field being enumerated
+ private final String field;
+
+ private boolean uninitialized = true;
+
+ // these accept statuses will be returned by accept(), depending on the internal mode
+ private final AcceptStatus NO_MATCH, YES_MATCH;
+
+ /**
+ * Construct an enumerator based upon an automaton, enumerating the specified
+ * field, working on a supplied reader.
+ *
+ * The parameter linearMode determines whether or not it will use smart enumeration.
+ */
+ AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader, boolean linearMode)
+ throws IOException {
+ super(reader, queryTerm.field());
+ this.automaton = automaton;
+ field = queryTerm.field();
+ this.linearMode = linearMode;
+
+ /*
+ * tableize the automaton. this also ensures it is deterministic, and has no
+ * transitions to dead states.
+ */
+ runAutomaton = new RunAutomaton(this.automaton);
+
+ if (this.linearMode) {
+ // iterate all terms in linear mode
+ this.finite = false;
+ transitionCache = null;
+ commonSuffixRef = new TermRef(cleanupSuffix(SpecialOperations.getCommonSuffix(automaton)));
+ NO_MATCH = AcceptStatus.NO;
+ YES_MATCH = AcceptStatus.YES;
+ } else {
+ // if the automaton is finite, we will never read sequentially, but always seek.
+ this.finite = SpecialOperations.isFinite(this.automaton);
+ // in nonlinear mode, the common suffix isn't that helpful.
+ // we will seek each time anyway (and take the unicode conversion hit).
+ // it's also currently expensive to calculate, because getCommonSuffix is
+ // a bit expensive.
+ commonSuffixRef = new TermRef("");
+ // build a cache of sorted transitions for every state
+ transitionCache = new HashMap(runAutomaton.getSize());
+ for (org.apache.lucene.util.automaton.State state : this.automaton.getStates()) {
+ List transitions = state.getSortedTransitions(false);
+ transitionCache.put(state, transitions.toArray(new Transition[transitions.size()]));
+ }
+
+ NO_MATCH = AcceptStatus.NO_AND_SEEK;
+ YES_MATCH = finite ? AcceptStatus.YES_AND_SEEK : AcceptStatus.YES;
+ }
+ }
+
+ /**
+ * Construct an enumerator based upon an automaton, enumerating the specified
+ * field, working on a supplied reader.
+ *
+ * It will automagically determine whether or not to enumerate the term dictionary
+ * in a smart way, or to just do a linear scan depending upon a heuristic.
+ */
+ public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader)
+ throws IOException {
+ this(automaton, queryTerm, reader, AutomatonTermsEnum.isSlow(automaton));
+ }
+
+ /**
+ * Heuristic to detect if an automaton will be so slow
+ * that it is better to do a linear enumeration.
+ *
+ * A very slow automaton will simply cause a lot of wasted disk seeks;
+ * in that case it is actually faster to do a linear enumeration instead.
+ *
+ * @param automaton automaton
+ * @return true if it will result in bad search performance
+ */
+ private static boolean isSlow(Automaton automaton) {
+ /*
+ * If the DFA has a leading Kleene star, or something similar, it will
+ * need to run against the entire term dictionary. In this case it's much
+ * better to do just that than to use smart enumeration.
+ *
+ * This heuristic looks for an initial loop with a range of at least 1/3
+ * of the Unicode BMP.
+ */
+ org.apache.lucene.util.automaton.State initialState = automaton.getInitialState();
+ boolean linearMode = false;
+ for (Transition transition : initialState.getTransitions()) {
+ if (transition.getDest() == initialState &&
+ (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) {
+ linearMode = true;
+ break;
+ }
+ }
+ return linearMode;
+ }
+
+ /**
+ * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode.
+ */
+ public final boolean usesLinearMode() {
+ return linearMode;
+ }
+
+ @Override
+ public float difference() {
+ return 1.0f;
+ }
+
+ /**
+ * Returns true if the term matches the automaton. Also stashes away the term
+ * to assist with smart enumeration.
+ *
+ * In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted.
+ * In smart mode, it will never do this.
+ */
+ @Override
+ protected AcceptStatus accept(final TermRef term) {
+ if (term.endsWith(commonSuffixRef)) {
+ UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+ return runAutomaton.run(utf16.result, 0, utf16.length) ? YES_MATCH : NO_MATCH;
+ } else {
+ return NO_MATCH;
+ }
+ }
+
+ @Override
+ protected TermRef nextSeekTerm(final boolean enumExhausted) throws IOException {
+ if (enumExhausted)
+ return null;
+ if (uninitialized) {
+ uninitialized = false;
+ // return the first seek term
+ if (linearMode) {
+ seekTermRef.copy("");
+ } else {
+ final String firstPoint = nextString("");
+ if (firstPoint == null)
+ return null;
+ seekTermRef.copy(firstPoint);
+ }
+ return seekTermRef;
+ } else if (!linearMode) {
+ // seek to the next possible string
+ final String nextPoint = nextString(tenum.term().toString());
+ if (nextPoint != null) {
+ // reposition
+ seekTermRef.copy(nextPoint);
+ return seekTermRef;
+ }
+ }
+ // no more possible strings can match
+ return null;
+ }
+
+ /**
+ * Returns the next String in lexicographic order after s that will not put
+ * the machine into a reject state. If such a string does not exist, returns
+ * null.
+ *
+ * The correctness of this method depends upon the automaton being deterministic,
+ * and having no transitions to dead states.
+ *
+ * @param s input String
+ * @return next valid String
+ */
+ private String nextString(String s) {
+ org.apache.lucene.util.automaton.State state;
+ int pos = 0;
+
+ while (true) {
+ state = automaton.getInitialState();
+ // walk the automaton until a character is rejected.
+ for (pos = 0; pos < s.length(); pos++) {
+ org.apache.lucene.util.automaton.State nextState = step(state, s.charAt(pos));
+ if (nextState == null)
+ break;
+ else
+ state = nextState;
+ }
+
+ // take the useful portion, and the last non-reject state, and attempt to
+ // append characters that will match.
+ String nextString = nextString(s, state, pos);
+ if (nextString != null) {
+ return cleanupPosition(nextString);
+ } else { /* no more solutions exist from this useful portion, backtrack */
+ String sprime = backtrack(s, pos);
+ if (sprime == null) /* no more solutions at all */
+ return null;
+ else if (runAutomaton.run(sprime)) /* String is good to go as-is */
+ return cleanupPosition(sprime);
+ else /* advance further */
+ s = sprime;
+ }
+ }
+ }
+
+ /**
+ * Returns the next String in lexicographic order after s that will not put
+ * the machine into a reject state. Appends some characters to the useful
+ * portion. If this cannot satisfy the machine, returns null. This method will
+ * walk the minimal path, in lexicographic order, as long as possible.
+ *
+ * @param s input String
+ * @param state current non-reject state
+ * @param useful useful portion of the string
+ * @return next valid String
+ */
+ private String nextString(String s, org.apache.lucene.util.automaton.State state, int useful) {
+ /*
+ * the next lexicographic character must be greater than the existing
+ * character, if it exists.
+ */
+ char c = 0;
+ if (useful < s.length()) {
+ c = s.charAt(useful);
+ // if the next character is U+FFFF and is not part of the useful portion,
+ // then by definition it puts us in a reject state, and therefore this
+ // path is dead. there cannot be any higher transitions. backtrack.
+ if (c == '\uFFFF')
+ return null;
+ else
+ c++;
+ }
+
+ StringBuilder sb = new StringBuilder();
+ // append the useful portion
+ sb.append(s, 0, useful);
+
+ Set visited = new HashSet();
+ visited.add(state);
+
+ Transition transitions[] = getTransitions(state);
+
+ // find the minimal path (lexicographic order) that is >= c
+
+ for (int i = 0; i < transitions.length; i++) {
+ Transition transition = transitions[i];
+ if (transition.getMax() >= c) {
+ char nextChar = (char) Math.max(c, transition.getMin());
+ sb.append(nextChar);
+ state = transition.getDest();
+ /*
+ * as long as is possible, continue down the minimal path in
+ * lexicographic order. if a loop or accept state is encountered, stop.
+ */
+ while (!visited.contains(state) && !state.isAccept()) {
+ visited.add(state);
+ /*
+ * Note: we work with a DFA with no transitions to dead states.
+ * so the below is ok, if it is not an accept state,
+ * then there MUST be at least one transition.
+ */
+ transition = getTransitions(state)[0];
+ sb.append(transition.getMin());
+ state = transition.getDest();
+ }
+ return sb.toString();
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Backtrack thru the string after encountering a dead end.
+ *
+ * @param s input String
+ * @param useful useful portion of the string
+ * @return next valid String to evaluate against the DFA, or null
+ */
+ private String backtrack(String s, int useful) {
+ while (useful > 0) {
+ char nextChar = s.charAt(useful - 1);
+ // if a character is U+FFFF its a dead-end too,
+ // because there is no higher character in UTF-16 sort order.
+ if (nextChar != '\uFFFF') {
+ nextChar++;
+ return s.substring(0, useful - 1) + nextChar;
+ }
+ useful--;
+ }
+ return null; /* all solutions exhausted */
+ }
+
+ /**
+ * Get the cached set of transitions for a state.
+ */
+ private Transition[] getTransitions(org.apache.lucene.util.automaton.State state) {
+ return transitionCache.get(state);
+ }
+
+ /**
+ * Step the state machine forward one character,
+ * using cached transitions.
+ */
+ private org.apache.lucene.util.automaton.State step(
+ org.apache.lucene.util.automaton.State state, char c) {
+ Transition transitions[] = getTransitions(state);
+ for (int i = 0; i < transitions.length; i++)
+ if (transitions[i].getMin() <= c && c <= transitions[i].getMax())
+ return transitions[i].getDest();
+ return null;
+ }
+
+ /**
+ * if the seek position cannot be converted to valid UTF-8,
+ * then return the next valid String (in UTF-16 sort order) that
+ * can be converted to valid UTF-8.
+ */
+ private String cleanupPosition(String position) {
+ if (position != null) {
+ StringBuilder sb = new StringBuilder();
+
+ for (int i = 0; i < position.length(); i++) {
+ final char ch = position.charAt(i);
+ if (Character.isHighSurrogate(ch)) {
+ if (i + 1 < position.length()) {
+
+ final char ch2 = position.charAt(i + 1);
+ if (ch2 < Character.MIN_LOW_SURROGATE) {
+ // invalid case #1, initial or medial in term
+ // high paired with invalid low, bump the next char up to MIN_LOW
+ sb.append(ch);
+ sb.append(Character.MIN_LOW_SURROGATE);
+ return sb.toString();
+ } else if (ch2 > Character.MAX_LOW_SURROGATE) {
+ // invalid case #2, initial or medial in term
+ // high paired with invalid low, but its past the boundary.
+ // this means all supp. characters have been enumerated.
+ // ditch both the chars, replace with the first valid codepoint
+ // after the surrogate range.
+ sb.append((char)(Character.MAX_LOW_SURROGATE + 1));
+ return sb.toString();
+ } else {
+ sb.append(ch);
+ }
+
+ } else {
+ // invalid case #3, final in term
+ // unpaired high, tack on MIN_LOW
+ sb.append(ch);
+ sb.append(Character.MIN_LOW_SURROGATE);
+ return sb.toString();
+ }
+ } else if (i > 0 && Character.isLowSurrogate(ch)) {
+ final char ch1 = position.charAt(i - 1);
+ if (Character.isHighSurrogate(ch1)) {
+ sb.append(ch);
+ } else {
+ // invalid case #4, medial unpaired low. bump past the boundary.
+ sb.append((char)(Character.MAX_LOW_SURROGATE + 1));
+ return sb.toString();
+ }
+ } else if (Character.isLowSurrogate(ch)){
+ // invalid case #5, initial unpaired low. bump past the boundary.
+ sb.append((char)(Character.MAX_LOW_SURROGATE + 1));
+ return sb.toString();
+ } else {
+ sb.append(ch);
+ }
+ }
+ return sb.toString();
+ } else
+ return null;
+ }
+
+ /**
+ * if the suffix starts with a low surrogate, remove it.
+ * This won't be quite as efficient, but can be converted to valid UTF-8
+ *
+ * This isn't nearly as complex as cleanupPosition, because its not
+ * going to use this suffix to walk any path thru the terms.
+ *
+ */
+ private String cleanupSuffix(String suffix) {
+ if (suffix != null && suffix.length() > 0 &&
+ Character.isLowSurrogate(suffix.charAt(0)))
+ return suffix.substring(1);
+ else
+ return suffix;
+ }
+}
Property changes on: src\java\org\apache\lucene\search\AutomatonTermsEnum.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
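
As a side note on the seeking logic above, backtrack() only ever bumps the last usable character of the prefix and drops trailing U+FFFF characters, for which no higher character exists. Below is a standalone copy of that method with a tiny demo; the class name and the sample inputs are illustrative only, not part of the patch.

// Standalone sketch of the backtrack() step used by nextString() above.
final class BacktrackSketch {
  static String backtrack(String s, int useful) {
    while (useful > 0) {
      char next = s.charAt(useful - 1);
      if (next != '\uFFFF') {
        next++;
        return s.substring(0, useful - 1) + next;
      }
      useful--; // this position is exhausted, back up one more character
    }
    return null; // every position was U+FFFF: no larger string exists
  }

  public static void main(String[] args) {
    System.out.println(backtrack("abc", 3));          // "abd"
    System.out.println(backtrack("ab\uFFFF", 3));     // "ac"
    System.out.println(backtrack("\uFFFF\uFFFF", 2)); // null
  }
}
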
Index: src/java/org/apache/lucene/search/EmptyTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/EmptyTermsEnum.java (revision 0)
+++ src/java/org/apache/lucene/search/EmptyTermsEnum.java (revision 0)
@@ -0,0 +1,62 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.TermRef;
+import org.apache.lucene.index.TermsEnum;
+
+/**
+ * Subclass of {@code FilteredTermsEnum} that is always empty.
+ *
+ * This can be used by {@link MultiTermQuery}s that know no terms can ever match the query,
+ * but still want to preserve MultiTermQuery semantics such as
+ * {@link MultiTermQuery#rewriteMethod}.
+ */
+public final class EmptyTermsEnum extends FilteredTermsEnum {
+
+ /**
+ * Creates a new EmptyTermsEnum.
+ */
+ public EmptyTermsEnum(final String field) {
+ super((TermsEnum) null, field);
+ }
+
+ /** Always returns {@link AcceptStatus#END}. */
+ @Override
+ protected AcceptStatus accept(TermRef term) {
+ return AcceptStatus.END;
+ }
+
+ @Override
+ public float difference() {
+ return 1.0F;
+ }
+
+ /** Always returns {@link SeekStatus#END}. */
+ @Override
+ public SeekStatus seek(TermRef term) {
+ return SeekStatus.END;
+ }
+
+ /** Always returns {@link SeekStatus#END}. */
+ @Override
+ public SeekStatus seek(long ord) {
+ return SeekStatus.END;
+ }
+
+}
Property changes on: src\java\org\apache\lucene\search\EmptyTermsEnum.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/search/FilteredTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/FilteredTermsEnum.java (revision 887534)
+++ src/java/org/apache/lucene/search/FilteredTermsEnum.java (working copy)
@@ -18,153 +18,218 @@
*/
import java.io.IOException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermRef;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
/**
* Abstract class for enumerating a subset of all terms.
- *
- *
- * On creation, the enumerator must already be positioned
- * to the first term.
*
- * Term enumerations are always ordered by
- * Term.compareTo(). Each term in the enumeration is
+ *
+ * Term enumerations should always be ordered by
+ * {@link #getTermComparator}. Each term in the enumeration is
* greater than all that precede it.
-*/
+ * This enum cannot guarantee this if you use seeking mode
+ * (by overriding {@link #nextSeekTerm}) and provide
+ * seek terms that are out of order or not greater
+ * than the current term. All {@code FilteredTermsEnum}
+ * implementations in Lucene Core honor this contract.
+ * For {@link MultiTermQuery}, the order is not
+ * important, but public subclasses should be ordered.
+ */
public abstract class FilteredTermsEnum extends TermsEnum {
- protected static enum AcceptStatus {YES, NO, END};
+ private final String field;
- /** the delegate enum - to set this member use {@link #setEnum} */
- protected TermsEnum actualEnum;
+ private TermRef initialSeekTerm = null;
+ private boolean doSeek = true, exhausted = false;
+
+ protected final TermsEnum tenum;
+
+ /** Return value indicating whether the term should be accepted or the iteration should
+ * {@code END}. The {@code *_SEEK} values denote that, after handling the current term,
+ * the enum should call {@link #nextSeekTerm} and step forward.
+ * @see #accept(TermRef)
+ */
+ protected static enum AcceptStatus {YES, YES_AND_SEEK, NO, NO_AND_SEEK, END};
+
+ /** Return whether the term is accepted, not accepted, or the iteration should end
+ * (and possibly seek).
+ */
+ protected abstract AcceptStatus accept(TermRef term) throws IOException;
- /** Return true if term is accepted */
- protected abstract AcceptStatus accept(TermRef term);
-
- /** Equality measure on the term */
- public abstract float difference();
+ /** Equality measure on the term, used by {@link FuzzyQuery} and
+ * scoring {@link MultiTermQuery}. */
+ public abstract float difference() throws IOException;
- public abstract String field();
+ /**
+ * Creates a filtered {@link TermsEnum} for the given field name and reader.
+ */
+ public FilteredTermsEnum(final IndexReader reader, final String field) throws IOException {
+ this.field = field;
+ final Terms terms = reader.fields().terms(field);
+ tenum = (terms != null) ? terms.iterator() : null;
+ }
- /** Only called once, right after construction, to check
- * whether there are no matching terms */
- public abstract boolean empty();
+ /**
+ * Creates a filtered {@link TermsEnum} on a terms enum for the given field name.
+ * @param tenum the terms enumeration to filter, if {@code null} this is the null iterator.
+ * @param field the field name this enum operates on (needed by {@link MultiTermQuery}).
+ */
+ public FilteredTermsEnum(final TermsEnum tenum, final String field) {
+ this.tenum = tenum;
+ this.field = field;
+ }
/**
- * use this method to set the actual TermsEnum (e.g. in ctor),
- * it will be automatically positioned on the first
- * accepted term, and returns the term found or null if
- * there is no matching term.
+ * Use this method to set the initial {@link TermRef}
+ * to seek before iterating. This is a convenience method for
+ * subclasses that do not override {@link #nextSeekTerm}.
+ * If the initial seek term is {@code null} (default),
+ * the enum is empty.
+ *
+ * You can only use this method if you keep the default
+ * implementation of {@link #nextSeekTerm}.
*/
- protected TermRef setEnum(TermsEnum actualEnum, TermRef term) throws IOException {
- this.actualEnum = actualEnum;
+ protected final void setInitialSeekTerm(TermRef term) throws IOException {
+ this.initialSeekTerm = term;
+ }
+
+ /** On the first call to {@link #next} or if {@link #accept} returns
+ * {@link AcceptStatus#YES_AND_SEEK} or {@link AcceptStatus#NO_AND_SEEK},
+ * this method will be called to eventually seek the underlying TermsEnum
+ * to a new position.
+ * By default, this method returns the initial seek term once
+ * and then {@code null}, so no further repositioning is ever done.
+ *
+ * Override this method if you want a more sophisticated TermsEnum
+ * that repositions the iterator during enumeration.
+ * If the {@code enumExhausted} parameter is {@code true}, the underlying
+ * enumeration is already exhausted and you do not need to return further terms
+ * (see below).
+ * If this method always returns {@code null} the enum is empty.
+ *
+ * Please note: This method should always provide a greater term
+ * than the last enumerated term, otherwise the behaviour of this enum
+ * violates the contract for TermsEnums. You are allowed to return new
+ * terms for {@code enumExhausted == true}, but doing so would eventually seek
+ * backwards.
+ */
+ protected TermRef nextSeekTerm(final boolean enumExhausted) throws IOException {
+ if (enumExhausted)
+ return null;
+ final TermRef t = initialSeekTerm;
+ initialSeekTerm = null;
+ return t;
+ }
- // Find the first term that matches
- if (term != null) {
- SeekStatus status = actualEnum.seek(term);
- if (status == SeekStatus.END) {
- return null;
- } else {
- AcceptStatus s = accept(actualEnum.term());
- if (s == AcceptStatus.NO) {
- return next();
- } else if (s == AcceptStatus.END) {
- return null;
- } else {
- return actualEnum.term();
- }
- }
- } else {
- return next();
- }
+ /** returns the field this TermsEnum is working on */
+ public final String field() {
+ return field;
}
+ /**
+ * Returns the related attributes, the returned {@link AttributeSource}
+ * is shared with the delegate {@code TermsEnum}.
+ */
@Override
+ public AttributeSource attributes() {
+ /* if we have no tenum, we return a new attributes instance,
+ * to prevent NPE in subclasses that use attributes.
+ * in all other cases we share the attributes with our delegate. */
+ return (tenum == null) ? super.attributes() : tenum.attributes();
+ }
+
+ @Override
public TermRef term() throws IOException {
- if(actualEnum == null) {
- return null;
- }
- return actualEnum.term();
+ return (tenum == null) ? null : tenum.term();
}
@Override
- /** Don't call this until after setEnum, else you'll hit NPE */
public TermRef.Comparator getTermComparator() throws IOException {
- return actualEnum.getTermComparator();
+ return (tenum == null) ? null : tenum.getTermComparator();
}
- /**
- * Returns the docFreq of the current Term in the enumeration.
- * Returns -1 if no Term matches or all terms have been enumerated.
- */
@Override
public int docFreq() {
- assert actualEnum != null;
- return actualEnum.docFreq();
+ return (tenum == null) ? -1 : tenum.docFreq();
}
-
- /** Increments the enumeration to the next element.
- * Non-null if one exists, or null if it's the end. */
- @Override
- public TermRef next() throws IOException {
- assert actualEnum != null;
- while (true) {
- TermRef term = actualEnum.next();
- if (term != null) {
- AcceptStatus s = accept(term);
- if (s == AcceptStatus.YES) {
- return term;
- } else if (s == AcceptStatus.END) {
- // end
- return null;
- }
- } else {
- // end
- return null;
- }
- }
- }
+ /** This enum does not support seeking!
+ * @throws UnsupportedOperationException
+ */
@Override
public SeekStatus seek(TermRef term) throws IOException {
- return finishSeek(actualEnum.seek(term));
+ throw new UnsupportedOperationException(getClass().getName()+" does not support seeking");
}
+ /** This enum does not support seeking!
+ * @throws UnsupportedOperationException
+ */
@Override
public SeekStatus seek(long ord) throws IOException {
- return finishSeek(actualEnum.seek(ord));
+ throw new UnsupportedOperationException(getClass().getName()+" does not support seeking");
}
- private SeekStatus finishSeek(SeekStatus status) throws IOException {
- if (status != SeekStatus.END) {
- TermRef term = actualEnum.term();
- final AcceptStatus s = accept(term);
- if (s == AcceptStatus.NO) {
- term = next();
- if (term == null) {
- return SeekStatus.END;
- } else {
- return SeekStatus.NOT_FOUND;
- }
- } else if (s == AcceptStatus.END) {
- return SeekStatus.END;
- } else {
- return status;
- }
- } else {
- return status;
- }
- }
-
@Override
public long ord() throws IOException {
- return actualEnum.ord();
+ return (tenum == null) ? -1 : tenum.ord();
}
@Override
public DocsEnum docs(Bits bits) throws IOException {
- return actualEnum.docs(bits);
+ return (tenum == null) ? null : tenum.docs(bits);
}
+
+ @Override
+ public TermRef next() throws IOException {
+ if (exhausted || tenum == null)
+ return null;
+ boolean delegateExhausted = false;
+ for (;;) {
+ // Seek or forward the iterator
+ final TermRef term;
+ if (doSeek) {
+ final TermRef t = nextSeekTerm(delegateExhausted);
+ if (t == null) {
+ // no more terms to seek we must end now!
+ exhausted = true;
+ return null;
+ }
+ if (tenum.seek(t) == SeekStatus.END) {
+ // enum exhausted, seek to next one
+ delegateExhausted = true;
+ continue;
+ }
+ delegateExhausted = doSeek = false;
+ term = tenum.term();
+ } else {
+ term = tenum.next();
+ if (term == null) {
+ // enum exhausted
+ delegateExhausted = doSeek = true;
+ continue;
+ }
+ }
+
+ // check if term is accepted
+ switch (accept(term)) {
+ case YES_AND_SEEK:
+ doSeek = true;
+ // term accepted, but we need to seek so fall-through
+ case YES:
+ // term accepted
+ return term;
+ case NO_AND_SEEK:
+ // invalid term, seek next time
+ doSeek = true;
+ break;
+ case END:
+ // we are supposed to end the enum
+ exhausted = true;
+ return null;
+ }
+ }
+ }
+
}
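
To illustrate the new protocol added above (setInitialSeekTerm(), accept() returning an AcceptStatus, and next() driving the seeking), here is a minimal hypothetical subclass in the spirit of PrefixTermsEnum and NumericRangeTermsEnum below. SingleRangeTermsEnum, its bounds, and the assumption that it lives in org.apache.lucene.search (so the protected members are visible) are all made up for the sketch.

package org.apache.lucene.search; // assumed package, same as FilteredTermsEnum

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermRef;

/** Hypothetical enum that accepts every term between a lower and an upper bound. */
final class SingleRangeTermsEnum extends FilteredTermsEnum {
  private final TermRef upperBound;
  private final TermRef.Comparator termComp;

  SingleRangeTermsEnum(IndexReader reader, String field, String lower, String upper)
      throws IOException {
    super(reader, field);
    upperBound = new TermRef(upper);
    termComp = getTermComparator();
    // seek once to the lower bound; the default nextSeekTerm() returns it a single time
    setInitialSeekTerm(new TermRef(lower));
  }

  @Override
  protected AcceptStatus accept(TermRef term) {
    // past the upper bound nothing else can match, so end the enumeration
    return termComp.compare(term, upperBound) <= 0 ? AcceptStatus.YES : AcceptStatus.END;
  }

  @Override
  public float difference() {
    return 1.0f; // constant "similarity", as in PrefixTermsEnum
  }
}
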
Index: src/java/org/apache/lucene/search/FuzzyTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 887534)
+++ src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy)
@@ -19,7 +19,6 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermRef;
import java.io.IOException;
@@ -27,8 +26,9 @@
/** Subclass of FilteredTermEnum for enumerating all terms that are similar
* to the specified filter term.
*
- *
- * Term enumerations are always ordered by Term.compareTo(). Each term in
- * the enumeration is greater than all that precede it.
+ *
+ * Term enumerations are always ordered by
+ * {@link #getTermComparator}. Each term in the enumeration is
+ * greater than all that precede it.
*/
public final class FuzzyTermsEnum extends FilteredTermsEnum {
@@ -44,10 +44,8 @@
private int[][] d;
private float similarity;
- private final boolean empty;
private Term searchTerm;
- private final String field;
private final String text;
private final String prefix;
@@ -102,7 +100,7 @@
* @throws IOException
*/
public FuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException {
- super();
+ super(reader, term.field());
if (minSimilarity >= 1.0f)
throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1");
@@ -114,7 +112,6 @@
this.minimumSimilarity = minSimilarity;
this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
this.searchTerm = term;
- this.field = searchTerm.field();
//The prefix could be longer than the word.
//It's kind of silly though. It means we must match the entire word.
@@ -127,21 +124,11 @@
initializeMaxDistances();
this.d = initDistanceArray();
- Terms terms = reader.fields().terms(field);
- if (terms != null) {
- empty = setEnum(terms.iterator(), prefixTermRef) == null;
- } else {
- empty = false;
- }
+ setInitialSeekTerm(prefixTermRef);
}
private final TermRef prefixTermRef;
- @Override
- public String field() {
- return field;
- }
-
/**
* The termCompare method in FuzzyTermEnum uses Levenshtein distance to
* calculate the distance between the given term and the comparing term.
@@ -163,11 +150,6 @@
return (float)((similarity - minimumSimilarity) * scale_factor);
}
- @Override
- public final boolean empty() {
- return empty;
- }
-
/******************************
* Compute Levenshtein distance
******************************/
Index: src/java/org/apache/lucene/search/MultiTermQuery.java
===================================================================
--- src/java/org/apache/lucene/search/MultiTermQuery.java (revision 887534)
+++ src/java/org/apache/lucene/search/MultiTermQuery.java (working copy)
@@ -103,22 +103,17 @@
// nocommit -- if no terms we'd want to return NullQuery
BooleanQuery result = new BooleanQuery(true);
- if (!termsEnum.empty()) {
- final String field = termsEnum.field();
- assert field != null;
- int count = 0;
- TermRef term = termsEnum.term();
- // first term must exist since termsEnum wasn't null
- assert term != null;
- do {
- TermQuery tq = new TermQuery(new Term(field, term.toString())); // found a match
- tq.setBoost(query.getBoost() * termsEnum.difference()); // set the boost
- result.add(tq, BooleanClause.Occur.SHOULD); // add to query
- count++;
- term = termsEnum.next();
- } while(term != null);
- query.incTotalNumberOfTerms(count);
+ final String field = termsEnum.field();
+ assert field != null;
+ int count = 0;
+ TermRef term;
+ while ((term = termsEnum.next()) != null) {
+ TermQuery tq = new TermQuery(new Term(field, term.toString())); // found a match
+ tq.setBoost(query.getBoost() * termsEnum.difference()); // set the boost
+ result.add(tq, BooleanClause.Occur.SHOULD); // add to query
+ count++;
}
+ query.incTotalNumberOfTerms(count);
return result;
} else {
// deprecated case
@@ -167,9 +162,14 @@
private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
- // strip the scores off
- Query result = new ConstantScoreQuery(new QueryWrapperFilter(super.rewrite(reader, query)));
- result.setBoost(query.getBoost());
+ Query result = super.rewrite(reader, query);
+ assert result instanceof BooleanQuery;
+ // nocommit: if empty boolean query return NullQuery
+ if (!((BooleanQuery) result).clauses().isEmpty()) {
+ // strip the scores off
+ result = new ConstantScoreQuery(new QueryWrapperFilter(result));
+ result.setBoost(query.getBoost());
+ }
return result;
}
@@ -248,54 +248,53 @@
// exhaust the enum before hitting either of the
// cutoffs, we use ConstantBooleanQueryRewrite; else,
// ConstantFilterRewrite:
- final Collection pendingTerms = new ArrayList();
- final Collection oldApiPendingTerms = new ArrayList();
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
int docVisitCount = 0;
FilteredTermsEnum termsEnum = query.getTermsEnum(reader);
if (termsEnum != null) {
- if (!termsEnum.empty()) {
- final String field = termsEnum.field();
- assert field != null;
- TermRef term = termsEnum.term();
- // first term must exist since termsEnum wasn't null
- assert term != null;
- do {
- pendingTerms.add((TermRef) term.clone());
- if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
- // Too many terms -- cut our losses now and make a filter.
- Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
- result.setBoost(query.getBoost());
- return result;
- }
- // Loading the TermInfo from the terms dict here
- // should not be costly, because 1) the
- // query/filter will load the TermInfo when it
- // runs, and 2) the terms dict has a cache:
- docVisitCount += reader.docFreq(field, term);
- term = termsEnum.next();
- } while(term != null);
-
- // Enumeration is done, and we hit a small
- // enough number of terms & docs -- just make a
- // BooleanQuery, now
+ final Collection pendingTerms = new ArrayList();
+ final String field = termsEnum.field();
+ assert field != null;
+ TermRef term;
+ while ((term = termsEnum.next()) != null) {
+ pendingTerms.add((TermRef) term.clone());
+ if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
+ // Too many terms -- cut our losses now and make a filter.
+ Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
+ result.setBoost(query.getBoost());
+ return result;
+ }
+ // Loading the TermInfo from the terms dict here
+ // should not be costly, because 1) the
+ // query/filter will load the TermInfo when it
+ // runs, and 2) the terms dict has a cache:
+ docVisitCount += reader.docFreq(field, term);
+ }
+
+ // Enumeration is done, and we hit a small
+ // enough number of terms & docs -- just make a
+ // BooleanQuery, now
+
+ // nocommit: if pendingTerms.size()==0 return NullQuery
+ final Query result;
+ if (pendingTerms.isEmpty()) {
+ result = new BooleanQuery(true);
+ } else {
BooleanQuery bq = new BooleanQuery(true);
for(TermRef termRef : pendingTerms) {
TermQuery tq = new TermQuery(new Term(field, termRef.toString()));
bq.add(tq, BooleanClause.Occur.SHOULD);
}
// Strip scores
- Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
+ result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
result.setBoost(query.getBoost());
- query.incTotalNumberOfTerms(pendingTerms.size());
- return result;
- } else {
- // nocommit -- need NullQuery here
- return new BooleanQuery();
}
+ query.incTotalNumberOfTerms(pendingTerms.size());
+ return result;
} else {
+ final Collection pendingTerms = new ArrayList();
// deprecated case
FilteredTermEnum enumerator = query.getEnum(reader);
@@ -303,7 +302,7 @@
while(true) {
Term t = enumerator.term();
if (t != null) {
- oldApiPendingTerms.add(t);
+ pendingTerms.add(t);
// Loading the TermInfo from the terms dict here
// should not be costly, because 1) the
// query/filter will load the TermInfo when it
@@ -313,21 +312,26 @@
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
// Too many terms -- make a filter.
- Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
+ Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
result.setBoost(query.getBoost());
return result;
} else if (!enumerator.next()) {
// Enumeration is done, and we hit a small
// enough number of terms & docs -- just make a
// BooleanQuery, now
- BooleanQuery bq = new BooleanQuery(true);
- for (final Term term: oldApiPendingTerms) {
- TermQuery tq = new TermQuery(term);
- bq.add(tq, BooleanClause.Occur.SHOULD);
+ final Query result;
+ if (pendingTerms.isEmpty()) {
+ result = new BooleanQuery(true);
+ } else {
+ BooleanQuery bq = new BooleanQuery(true);
+ for(Term term : pendingTerms) {
+ TermQuery tq = new TermQuery(term);
+ bq.add(tq, BooleanClause.Occur.SHOULD);
+ }
+ // Strip scores
+ result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
+ result.setBoost(query.getBoost());
}
- // Strip scores
- Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
- result.setBoost(query.getBoost());
query.incTotalNumberOfTerms(pendingTerms.size());
return result;
}
Index: src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java
===================================================================
--- src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (revision 887534)
+++ src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (working copy)
@@ -106,14 +106,14 @@
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
final FilteredTermsEnum termsEnum = query.getTermsEnum(reader);
if (termsEnum != null) {
- if (!termsEnum.empty()) {
+ if (termsEnum.next() != null) {
// fill into a OpenBitSet
final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
final int[] docs = new int[32];
final int[] freqs = new int[32];
int termCount = 0;
final Bits delDocs = reader.getDeletedDocs();
- while (true) {
+ do {
termCount++;
// System.out.println(" iter termCount=" + termCount + " term=" +
// enumerator.term().toBytesString());
@@ -128,13 +128,7 @@
break;
}
}
- TermRef term = termsEnum.next();
- if (term == null) {
- break;
- }
- // System.out.println(" enum next term=" + term.toBytesString());
- assert term.termEquals(termsEnum.term());
- }
+ } while (termsEnum.next() != null);
// System.out.println(" done termCount=" + termCount);
query.incTotalNumberOfTerms(termCount);
Index: src/java/org/apache/lucene/search/NumericRangeQuery.java
===================================================================
--- src/java/org/apache/lucene/search/NumericRangeQuery.java (revision 887534)
+++ src/java/org/apache/lucene/search/NumericRangeQuery.java (working copy)
@@ -28,8 +28,6 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermRef;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
/**
* A {@link Query} that matches numeric values within a
@@ -163,7 +161,7 @@
assert (valSize == 32 || valSize == 64);
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
- this.field = StringHelper.intern(field);
+ this.field = field;
this.precisionStep = precisionStep;
this.valSize = valSize;
this.min = min;
@@ -303,9 +301,12 @@
return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 32, min, max, minInclusive, maxInclusive);
}
- @Override
+ @Override @SuppressWarnings("unchecked")
protected FilteredTermsEnum getTermsEnum(final IndexReader reader) throws IOException {
- return new NumericRangeTermsEnum(reader);
+ // very strange: java.lang.Number itself is not Comparable, but all subclasses used here are
+ return (min != null && max != null && ((Comparable) min).compareTo(max) > 0) ?
+ new EmptyTermsEnum(field) :
+ new NumericRangeTermsEnum(reader);
}
/** Returns the field name for this query */
@@ -344,7 +345,7 @@
if (o instanceof NumericRangeQuery) {
final NumericRangeQuery q=(NumericRangeQuery)o;
return (
- field==q.field &&
+ field.equals(q.field) &&
(q.min == null ? min == null : q.min.equals(min)) &&
(q.max == null ? max == null : q.max.equals(max)) &&
minInclusive == q.minInclusive &&
@@ -365,15 +366,9 @@
(Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+
(Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe);
}
-
- // field must be interned after reading from stream
- private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
- in.defaultReadObject();
- field = StringHelper.intern(field);
- }
// members (package private, to be also fast accessible by NumericRangeTermEnum)
- String field;
+ final String field;
final int precisionStep, valSize;
final T min, max;
final boolean minInclusive,maxInclusive;
@@ -390,15 +385,13 @@
*/
private final class NumericRangeTermsEnum extends FilteredTermsEnum {
- private final IndexReader reader;
- private final LinkedList rangeBounds = new LinkedList();
private TermRef currentUpperBound = null;
- private final boolean empty;
+
+ private final LinkedList rangeBounds = new LinkedList();
private final TermRef.Comparator termComp;
NumericRangeTermsEnum(final IndexReader reader) throws IOException {
- this.reader = reader;
-
+ super(reader, field);
switch (valSize) {
case 64: {
// lower
@@ -475,21 +468,7 @@
throw new IllegalArgumentException("valSize must be 32 or 64");
}
- // initialize iterator
- final Terms terms = reader.fields().terms(field);
- if (terms != null) {
- // TODO: NRQ by design relies on a specific sort
- // order; I think UT8 or UTF16 would work (NRQ encodes
- // to only ASCII).
- termComp = terms.getTermComparator();
- actualEnum = terms.iterator();
- } else {
- termComp = null;
- actualEnum = null;
- }
-
- // seek to first term
- empty = next() == null;
+ termComp = getTermComparator();
}
@Override
@@ -498,53 +477,10 @@
}
@Override
- public boolean empty() {
- return empty;
- }
-
- @Override
- protected TermRef setEnum(TermsEnum actualEnum, TermRef term) throws IOException {
- throw new UnsupportedOperationException("not implemented");
- }
-
- @Override
- public SeekStatus seek(TermRef term) throws IOException {
- throw new UnsupportedOperationException("not implemented");
- }
-
- @Override
- public SeekStatus seek(long ord) throws IOException {
- throw new UnsupportedOperationException("not implemented");
- }
-
- @Override
- public String field() {
- return field;
- }
-
- @Override
- protected AcceptStatus accept(TermRef term) {
- return (termComp.compare(term, currentUpperBound) <= 0) ?
- AcceptStatus.YES : AcceptStatus.NO;
- }
-
- @Override
- public TermRef next() throws IOException {
- if (actualEnum == null) {
+ protected final TermRef nextSeekTerm(final boolean enumExhausted) throws IOException {
+ if (enumExhausted)
return null;
- }
-
- // try change to next term, if no such term exists, fall-through
- // (we can only do this if the enum was already seeked)
- if (currentUpperBound != null) {
- final TermRef term = actualEnum.next();
- if (term != null && accept(term) == AcceptStatus.YES) {
- return term;
- }
- }
-
- // if all above fails, we seek forward
- while (rangeBounds.size() >= 2) {
+ if (rangeBounds.size() >= 2) {
assert rangeBounds.size() % 2 == 0;
final TermRef lowerBound = new TermRef(rangeBounds.removeFirst());
@@ -552,22 +488,19 @@
"The current upper bound must be <= the new lower bound";
this.currentUpperBound = new TermRef(rangeBounds.removeFirst());
-
- SeekStatus status = actualEnum.seek(lowerBound);
- if (status == SeekStatus.END) {
- return null;
- }
-
- final TermRef term = actualEnum.term();
- if (accept(term) == AcceptStatus.YES) {
- return term;
- }
+ return lowerBound;
}
// no more sub-range enums available
assert rangeBounds.size() == 0;
return null;
}
+
+ @Override
+ protected AcceptStatus accept(TermRef term) {
+ return (currentUpperBound != null && termComp.compare(term, currentUpperBound) <= 0) ?
+ AcceptStatus.YES : AcceptStatus.NO_AND_SEEK;
+ }
}
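
The control flow of NumericRangeTermsEnum above (nextSeekTerm() popping the next lower bound, accept() checking against the matching upper bound and asking for another seek) can be modelled with plain Strings. The sketch below is not part of the patch; the bounds are made-up placeholders rather than real NumericUtils-encoded prefixes.

import java.util.LinkedList;

// Toy model of the sub-range seeking used by NumericRangeTermsEnum above.
final class SubRangeSketch {
  private final LinkedList<String> rangeBounds = new LinkedList<String>();
  private String currentUpperBound = null;

  SubRangeSketch() {
    // two [lower, upper] pairs, analogous to what the numeric range split produces
    rangeBounds.add("a0"); rangeBounds.add("a7");
    rangeBounds.add("b0"); rangeBounds.add("b3");
  }

  /** like nextSeekTerm(): hand out the next lower bound and remember its upper bound */
  String nextSeekTerm() {
    if (rangeBounds.size() >= 2) {
      String lowerBound = rangeBounds.removeFirst();
      currentUpperBound = rangeBounds.removeFirst();
      return lowerBound;
    }
    return null; // no more sub-ranges, the enum ends
  }

  /** like accept(): true while at or below the upper bound, otherwise seek again */
  boolean accept(String term) {
    return currentUpperBound != null && term.compareTo(currentUpperBound) <= 0;
  }

  public static void main(String[] args) {
    SubRangeSketch sketch = new SubRangeSketch();
    System.out.println(sketch.nextSeekTerm()); // "a0": seek into the first sub-range
    System.out.println(sketch.accept("a5"));   // true: still below "a7"
    System.out.println(sketch.accept("a9"));   // false: past "a7", so seek again
    System.out.println(sketch.nextSeekTerm()); // "b0": seek into the second sub-range
  }
}
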
Index: src/java/org/apache/lucene/search/PrefixTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/PrefixTermsEnum.java (revision 887534)
+++ src/java/org/apache/lucene/search/PrefixTermsEnum.java (working copy)
@@ -21,50 +21,31 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermRef;
/**
* Subclass of FilteredTermEnum for enumerating all terms that match the
* specified prefix filter term.
- *
- * Term enumerations are always ordered by Term.compareTo(). Each term in
- * the enumeration is greater than all that precede it.
- *
+ *
+ * Term enumerations are always ordered by
+ * {@link #getTermComparator}. Each term in the enumeration is
+ * greater than all that precede it.
*/
public class PrefixTermsEnum extends FilteredTermsEnum {
private final Term prefix;
private final TermRef prefixRef;
- private final boolean empty;
public PrefixTermsEnum(IndexReader reader, Term prefix) throws IOException {
+ super(reader, prefix.field());
this.prefix = prefix;
- Terms terms = reader.fields().terms(prefix.field());
- if (terms != null) {
- prefixRef = new TermRef(prefix.text());
- empty = setEnum(terms.iterator(), prefixRef) == null;
- } else {
- empty = true;
- prefixRef = null;
- }
+ setInitialSeekTerm(prefixRef = new TermRef(prefix.text()));
}
@Override
- public String field() {
- return prefix.field();
- }
-
- @Override
public float difference() {
return 1.0f;
}
- @Override
- public boolean empty() {
- return empty;
- }
-
protected Term getPrefixTerm() {
return prefix;
}
Index: src/java/org/apache/lucene/search/RegexpQuery.java
===================================================================
--- src/java/org/apache/lucene/search/RegexpQuery.java (revision 0)
+++ src/java/org/apache/lucene/search/RegexpQuery.java (revision 0)
@@ -0,0 +1,105 @@
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonProvider;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A fast regular expression query based on the
+ * {@link org.apache.lucene.util.automaton} package.
+ *
+ * - Comparisons are fast
+ *
+ * - The term dictionary is enumerated in an intelligent way, to avoid
+ * comparisons. See {@link AutomatonQuery} for more details.
+ *
+ *
+ * The supported syntax is documented in the {@link RegExp} class.
+ * Note this might be different from other regular expression implementations.
+ * For some alternatives with different syntax, look under contrib/regex
+ *
+ *
+ * Note this query can be slow, as it needs to iterate over many terms. In order
+ * to prevent extremely slow RegexpQueries, a Regexp term should not start with
+ * the expression .*
+ *
+ * @see RegExp
+ */
+public class RegexpQuery extends AutomatonQuery {
+ /**
+ * A provider that provides no named automata
+ */
+ private static AutomatonProvider defaultProvider = new AutomatonProvider() {
+ public Automaton getAutomaton(String name) throws IOException {
+ return null;
+ }
+ };
+
+ /**
+ * Constructs a query for terms matching term.
+ *
+ * By default, all regular expression features are enabled.
+ *
+ *
+ * @param term regular expression.
+ */
+ public RegexpQuery(Term term) {
+ this(term, RegExp.ALL);
+ }
+
+ /**
+ * Constructs a query for terms matching term.
+ *
+ * @param term regular expression.
+ * @param flags optional RegExp features from {@link RegExp}
+ */
+ public RegexpQuery(Term term, int flags) {
+ this(term, flags, defaultProvider);
+ }
+
+ /**
+ * Constructs a query for terms matching term.
+ *
+ * @param term regular expression.
+ * @param flags optional RegExp features from {@link RegExp}
+ * @param provider custom AutomatonProvider for named automata
+ */
+ public RegexpQuery(Term term, int flags, AutomatonProvider provider) {
+ super(term, new RegExp(term.text(), flags).toAutomaton(provider));
+ }
+
+ /** Prints a user-readable version of this query. */
+ @Override
+ public String toString(String field) {
+ StringBuilder buffer = new StringBuilder();
+ if (!term.field().equals(field)) {
+ buffer.append(term.field());
+ buffer.append(":");
+ }
+ buffer.append(term.text());
+ buffer.append(ToStringUtils.boost(getBoost()));
+ return buffer.toString();
+ }
+}
Property changes on: src\java\org\apache\lucene\search\RegexpQuery.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
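
A short usage sketch for the new RegexpQuery above; the field name and pattern are illustrative, not taken from the patch:

    // Terms of the "body" field matching the regular expression "lu.*ene".
    Query q = new RegexpQuery(new Term("body", "lu.*ene"));

    // Same pattern with the optional RegExp syntax extensions disabled; a custom
    // AutomatonProvider is only needed when the pattern references named automata.
    Query plain = new RegexpQuery(new Term("body", "lu.*ene"), RegExp.NONE);
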
Index: src/java/org/apache/lucene/search/SingleTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/SingleTermsEnum.java (revision 887534)
+++ src/java/org/apache/lucene/search/SingleTermsEnum.java (working copy)
@@ -31,10 +31,8 @@
* but want to preserve MultiTermQuery semantics such as
* {@link MultiTermQuery#rewriteMethod}.
*/
-public class SingleTermsEnum extends FilteredTermsEnum {
- private final Term singleTerm;
+public final class SingleTermsEnum extends FilteredTermsEnum {
private final TermRef singleRef;
- private final boolean empty;
/**
* Creates a new SingleTermsEnum.
@@ -43,38 +41,18 @@
* if it exists.
*/
public SingleTermsEnum(IndexReader reader, Term singleTerm) throws IOException {
- this.singleTerm = singleTerm;
- Terms terms = reader.fields().terms(singleTerm.field());
- if (terms != null) {
- singleRef = new TermRef(singleTerm.text());
- empty = setEnum(terms.iterator(), singleRef) == null;
- } else {
- empty = true;
- singleRef = null;
- }
+ super(reader, singleTerm.field());
+ singleRef = new TermRef(singleTerm.text());
+ setInitialSeekTerm(singleRef);
}
@Override
protected AcceptStatus accept(TermRef term) {
- if (term.equals(singleRef)) {
- return AcceptStatus.YES;
- } else {
- return AcceptStatus.END;
- }
+ return term.equals(singleRef) ? AcceptStatus.YES : AcceptStatus.END;
}
@Override
public float difference() {
return 1.0F;
}
-
- @Override
- public boolean empty() {
- return empty;
- }
-
- @Override
- public String field() {
- return singleTerm.field();
- }
}
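
A sketch of where the simplified SingleTermsEnum is typically useful: a MultiTermQuery whose pattern degenerates to a single literal term can still go through the normal rewrite machinery (field and term are illustrative):

    // Accepts exactly one term, but preserves MultiTermQuery rewrite semantics.
    FilteredTermsEnum e = new SingleTermsEnum(reader, new Term("field", "exact"));
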
Index: src/java/org/apache/lucene/search/TermRangeQuery.java
===================================================================
--- src/java/org/apache/lucene/search/TermRangeQuery.java (revision 887534)
+++ src/java/org/apache/lucene/search/TermRangeQuery.java (working copy)
@@ -142,10 +142,10 @@
@Override
protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException {
- return new TermRangeTermsEnum(reader, field,
- lowerTerm, upperTerm,
- includeLower, includeUpper,
- collator);
+ return (collator == null && lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) ?
+ new EmptyTermsEnum(field) :
+ new TermRangeTermsEnum(reader, field,
+ lowerTerm, upperTerm, includeLower, includeUpper, collator);
}
/** Prints a user-readable version of this query. */
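
To make the new guard concrete, a degenerate range whose lower term sorts after its upper term (hypothetical values) now rewrites to an empty enum without touching the term dictionary at all:

    // "m" > "a", so no term can match; getTermsEnum() returns an EmptyTermsEnum.
    TermRangeQuery dead = new TermRangeQuery("title", "m", "a", true, true);
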
Index: src/java/org/apache/lucene/search/TermRangeTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/TermRangeTermsEnum.java (revision 887534)
+++ src/java/org/apache/lucene/search/TermRangeTermsEnum.java (working copy)
@@ -22,28 +22,24 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermRef;
-import org.apache.lucene.index.Terms;
-//import org.apache.lucene.index.Term;
import org.apache.lucene.util.StringHelper;
/**
* Subclass of FilteredTermEnum for enumerating all terms that match the
* specified range parameters.
- *
- * Term enumerations are always ordered by Term.compareTo(). Each term in
- * the enumeration is greater than all that precede it.
+ *
+ * Term enumerations are always ordered by
+ * {@link #getTermComparator}. Each term in the enumeration is
+ * greater than all that precede it.
*/
public class TermRangeTermsEnum extends FilteredTermsEnum {
private Collator collator;
- private String field;
private String upperTermText;
private String lowerTermText;
private boolean includeLower;
private boolean includeUpper;
final private TermRef lowerTermRef;
final private TermRef upperTermRef;
- private final boolean empty;
private final TermRef.Comparator termComp;
/**
@@ -75,12 +71,12 @@
*/
public TermRangeTermsEnum(IndexReader reader, String field, String lowerTermText, String upperTermText,
boolean includeLower, boolean includeUpper, Collator collator) throws IOException {
+ super(reader, field);
this.collator = collator;
this.upperTermText = upperTermText;
this.lowerTermText = lowerTermText;
this.includeLower = includeLower;
this.includeUpper = includeUpper;
- this.field = StringHelper.intern(field);
// do a little bit of normalization...
// open ended range queries should always be inclusive.
@@ -97,22 +93,9 @@
upperTermRef = new TermRef(upperTermText);
}
- String startTermText = collator == null ? this.lowerTermText : "";
- Terms terms = reader.fields().terms(field);
-
- if (terms != null) {
- termComp = terms.getTermComparator();
- final boolean foundFirstTerm = setEnum(terms.iterator(), new TermRef(startTermText)) != null;
-
- if (foundFirstTerm && collator == null && !this.includeLower && term().termEquals(lowerTermRef)) {
- empty = next() == null;
- } else {
- empty = !foundFirstTerm;
- }
- } else {
- empty = true;
- termComp = null;
- }
+ TermRef startTermRef = (collator == null) ? lowerTermRef : new TermRef("");
+ setInitialSeekTerm(startTermRef);
+ termComp = getTermComparator();
}
@Override
@@ -121,18 +104,10 @@
}
@Override
- public boolean empty() {
- return empty;
- }
-
- @Override
- public String field() {
- return field;
- }
-
- @Override
protected AcceptStatus accept(TermRef term) {
if (collator == null) {
+ if (!this.includeLower && term.equals(lowerTermRef))
+ return AcceptStatus.NO;
// Use this field's default sort ordering
if (upperTermRef != null) {
final int cmp = termComp.compare(upperTermRef, term);
Index: src/java/org/apache/lucene/search/WildcardQuery.java
===================================================================
--- src/java/org/apache/lucene/search/WildcardQuery.java (revision 887534)
+++ src/java/org/apache/lucene/search/WildcardQuery.java (working copy)
@@ -17,76 +17,67 @@
* limitations under the License.
*/
-import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
-import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
/** Implements the wildcard search query. Supported wildcards are *, which
* matches any character sequence (including the empty one), and ?,
* which matches any single character. Note this query can be slow, as it
* needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
- * a Wildcard term should not start with one of the wildcards * or
- * ?.
+ * a Wildcard term should not start with the wildcard *.
*
* This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* rewrite method.
*
* @see WildcardTermEnums */
-public class WildcardQuery extends MultiTermQuery {
- private boolean termContainsWildcard;
- private boolean termIsPrefix;
- protected Term term;
-
- public WildcardQuery(Term term) {
- this.term = term;
- String text = term.text();
- this.termContainsWildcard = (text.indexOf('*') != -1)
- || (text.indexOf('?') != -1);
- this.termIsPrefix = termContainsWildcard
- && (text.indexOf('?') == -1)
- && (text.indexOf('*') == text.length() - 1);
- }
-
- @Override
- protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException {
- if (termContainsWildcard)
- return new WildcardTermsEnum(reader, getTerm());
- else
- return new SingleTermsEnum(reader, getTerm());
- }
-
- // @deprecated see getTermsEnum
- @Override
- protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
- if (termContainsWildcard)
- return new WildcardTermEnum(reader, getTerm());
- else
- return new SingleTermEnum(reader, getTerm());
- }
-
- /**
- * Returns the pattern term.
- */
- public Term getTerm() {
- return term;
- }
- @Override
- public Query rewrite(IndexReader reader) throws IOException {
- if (termIsPrefix) {
- MultiTermQuery rewritten = new PrefixQuery(term.createTerm(term.text()
- .substring(0, term.text().indexOf('*'))));
- rewritten.setBoost(getBoost());
- rewritten.setRewriteMethod(getRewriteMethod());
- return rewritten;
- } else {
- return super.rewrite(reader);
- }
- }
-
+ public class WildcardQuery extends AutomatonQuery {
+
+ /** String equality with support for wildcards */
+ public static final char WILDCARD_STRING = '*';
+
+ /** Char equality with support for wildcards */
+ public static final char WILDCARD_CHAR = '?';
+
+ /**
+ * Constructs a query for terms matching term.
+ */
+ public WildcardQuery(Term term) {
+ super(term, toAutomaton(term));
+ }
+
+ /**
+ * Convert Lucene wildcard syntax into an automaton.
+ */
+ static Automaton toAutomaton(Term wildcardquery) {
+    List<Automaton> automata = new ArrayList<Automaton>();
+
+ String wildcardText = wildcardquery.text();
+
+ for (int i = 0; i < wildcardText.length(); i++) {
+ final char c = wildcardText.charAt(i);
+ switch(c) {
+ case WILDCARD_STRING:
+ automata.add(BasicAutomata.makeAnyString());
+ break;
+ case WILDCARD_CHAR:
+ automata.add(BasicAutomata.makeAnyChar());
+ break;
+ default:
+ automata.add(BasicAutomata.makeChar(c));
+ }
+ }
+
+ return BasicOperations.concatenate(automata);
+ }
+
/** Prints a user-readable version of this query. */
@Override
public String toString(String field) {
@@ -99,30 +90,4 @@
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
-
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = super.hashCode();
- result = prime * result + ((term == null) ? 0 : term.hashCode());
- return result;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (!super.equals(obj))
- return false;
- if (getClass() != obj.getClass())
- return false;
- WildcardQuery other = (WildcardQuery) obj;
- if (term == null) {
- if (other.term != null)
- return false;
- } else if (!term.equals(other.term))
- return false;
- return true;
- }
-
}
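
To illustrate the translation performed by toAutomaton() above, the pattern "te?t*" becomes a concatenation of per-character automata. This sketch uses only the factory calls already referenced by the method:

    // "te?t*"  =>  't' . 'e' . any-char . 't' . any-string
    List<Automaton> parts = new ArrayList<Automaton>();
    parts.add(BasicAutomata.makeChar('t'));
    parts.add(BasicAutomata.makeChar('e'));
    parts.add(BasicAutomata.makeAnyChar());    // '?'
    parts.add(BasicAutomata.makeChar('t'));
    parts.add(BasicAutomata.makeAnyString());  // '*'
    Automaton a = BasicOperations.concatenate(parts);
    // equivalent to WildcardQuery.toAutomaton(new Term("f", "te?t*"))
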
Index: src/java/org/apache/lucene/search/WildcardTermEnum.java
===================================================================
--- src/java/org/apache/lucene/search/WildcardTermEnum.java (revision 887534)
+++ src/java/org/apache/lucene/search/WildcardTermEnum.java (working copy)
@@ -30,13 +30,7 @@
* the enumeration is greater than all that precede it.
* @deprecated Please use {@link WildcardTermsEnum} instead.
*/
-public class WildcardTermEnum extends FilteredTermEnum {
- final Term searchTerm;
- final String field;
- final String text;
- final String pre;
- final int preLen;
- boolean endEnum = false;
+public class WildcardTermEnum extends AutomatonTermEnum {
/**
* Creates a new WildcardTermEnum.
@@ -45,60 +39,21 @@
* valid term if such a term exists.
*/
public WildcardTermEnum(IndexReader reader, Term term) throws IOException {
- super();
- searchTerm = term;
- field = searchTerm.field();
- final String searchTermText = searchTerm.text();
-
- final int sidx = searchTermText.indexOf(WILDCARD_STRING);
- final int cidx = searchTermText.indexOf(WILDCARD_CHAR);
- int idx = sidx;
- if (idx == -1) {
- idx = cidx;
- }
- else if (cidx >= 0) {
- idx = Math.min(idx, cidx);
- }
- pre = idx != -1?searchTerm.text().substring(0,idx): "";
-
- preLen = pre.length();
- text = searchTermText.substring(preLen);
- setEnum(reader.terms(new Term(searchTerm.field(), pre)));
+ super(WildcardQuery.toAutomaton(term), term, reader);
}
- @Override
- protected final boolean termCompare(Term term) {
- if (field == term.field()) {
- String searchText = term.text();
- if (searchText.startsWith(pre)) {
- return wildcardEquals(text, 0, searchText, preLen);
- }
- }
- endEnum = true;
- return false;
- }
+ /** String equality with support for wildcards */
+ public static final char WILDCARD_STRING = WildcardQuery.WILDCARD_STRING;
- @Override
- public float difference() {
- return 1.0f;
- }
+ /** Char equality with support for wildcards */
+ public static final char WILDCARD_CHAR = WildcardQuery.WILDCARD_CHAR;
- @Override
- public final boolean endEnum() {
- return endEnum;
- }
-
- /********************************************
- * String equality with support for wildcards
- ********************************************/
-
- public static final char WILDCARD_STRING = '*';
- public static final char WILDCARD_CHAR = '?';
-
/**
* Determines if a word matches a wildcard pattern.
* Work released by Granta Design Ltd after originally being done on
* company time.
+ * Note: This method is no longer used by this class!
+ * It is dead code and only available for backwards compatibility.
*/
public static final boolean wildcardEquals(String pattern, int patternIdx,
String string, int stringIdx)
Index: src/java/org/apache/lucene/search/WildcardTermsEnum.java
===================================================================
--- src/java/org/apache/lucene/search/WildcardTermsEnum.java (revision 887534)
+++ src/java/org/apache/lucene/search/WildcardTermsEnum.java (working copy)
@@ -35,11 +35,9 @@
*/
public class WildcardTermsEnum extends FilteredTermsEnum {
final Term searchTerm;
- final String field;
final String text;
final String pre;
final int preLen;
- private final boolean empty;
private final TermRef preTermRef;
/**
@@ -49,9 +47,8 @@
* valid term if such a term exists.
*/
public WildcardTermsEnum(IndexReader reader, Term term) throws IOException {
- super();
- searchTerm = term;
- field = searchTerm.field();
+ super(reader, term.field());
+ this.searchTerm = term;
final String searchTermText = searchTerm.text();
final int sidx = searchTermText.indexOf(WILDCARD_STRING);
@@ -67,22 +64,10 @@
preLen = pre.length();
text = searchTermText.substring(preLen);
- preTermRef = new TermRef(pre);
-
- Terms terms = reader.fields().terms(searchTerm.field());
- if (terms != null) {
- empty = setEnum(terms.iterator(), preTermRef) == null;
- } else {
- empty = true;
- }
+ setInitialSeekTerm(preTermRef = new TermRef(pre));
}
@Override
- public String field() {
- return searchTerm.field();
- }
-
- @Override
protected final AcceptStatus accept(TermRef term) {
if (term.startsWith(preTermRef)) {
// TODO: would be better, but trickier, to not have to
@@ -104,11 +89,6 @@
return 1.0f;
}
- @Override
- public final boolean empty() {
- return empty;
- }
-
/********************************************
* String equality with support for wildcards
********************************************/
Index: src/java/org/apache/lucene/util/automaton/Automaton.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/Automaton.java (revision 0)
@@ -0,0 +1,748 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Finite-state automaton with regular expression operations.
+ *
+ * Class invariants:
+ *
+ * - An automaton is either represented explicitly (with {@link State} and
+ * {@link Transition} objects) or with a singleton string (see
+ * {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton
+ * is known to accept exactly one string. (Implicitly, all states and
+ * transitions of an automaton are reachable from its initial state.)
+ *
+ * - Automata are always reduced (see {@link #reduce()}) and have no
+ * transitions to dead states (see {@link #removeDeadTransitions()}).
+ *
+ * - If an automaton is nondeterministic, then {@link #isDeterministic()}
+ * returns false (but the converse is not required).
+ *
+ * - Automata provided as input to operations are generally assumed to be
+ * disjoint.
+ *
+ *
+ * If the states or transitions are manipulated manually, the
+ * {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods
+ * should be used afterwards to restore representation invariants that are
+ * assumed by the built-in automata operations.
+ *
+ *
+ * WARNING: The status of the Automaton feature is
+ * experimental. The APIs introduced here might change in the future and will
+ * not be supported anymore in such a case.
+ */
+public class Automaton implements Serializable, Cloneable {
+
+ static final long serialVersionUID = 10001;
+
+ /**
+ * Minimize using Hopcroft's O(n log n) algorithm. This is regarded as one of
+ * the most generally efficient algorithms that exist.
+ *
+ * @see #setMinimization(int)
+ */
+ public static final int MINIMIZE_HOPCROFT = 2;
+
+ /** Selects minimization algorithm (default: MINIMIZE_HOPCROFT). */
+ static int minimization = MINIMIZE_HOPCROFT;
+
+ /** Initial state of this automaton. */
+ State initial;
+
+ /**
+ * If true, then this automaton is definitely deterministic (i.e., there are
+ * no choices for any run, but a run may crash).
+ */
+ boolean deterministic;
+
+ /** Extra data associated with this automaton. */
+ transient Object info;
+
+ /**
+ * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)}
+ */
+ int hash_code;
+
+ /** Singleton string. Null if not applicable. */
+ String singleton;
+
+ /** Minimize always flag. */
+ static boolean minimize_always = false;
+
+ /**
+ * Selects whether operations may modify the input automata (default:
+ * false).
+ */
+ static boolean allow_mutation = false;
+
+ /**
+ * Constructs a new automaton that accepts the empty language. Using this
+ * constructor, automata can be constructed manually from {@link State} and
+ * {@link Transition} objects.
+ *
+ * @see #setInitialState(State)
+ * @see State
+ * @see Transition
+ */
+ public Automaton() {
+ initial = new State();
+ deterministic = true;
+ singleton = null;
+ }
+
+ boolean isDebug() {
+ return System.getProperty("dk.brics.automaton.debug") != null;
+ }
+
+ /**
+ * Selects minimization algorithm (default: MINIMIZE_HOPCROFT).
+ *
+ * @param algorithm minimization algorithm
+ */
+ static public void setMinimization(int algorithm) {
+ minimization = algorithm;
+ }
+
+ /**
+ * Sets or resets minimize always flag. If this flag is set, then
+ * {@link MinimizationOperations#minimize(Automaton)} will automatically be
+ * invoked after all operations that otherwise may produce non-minimal
+ * automata. By default, the flag is not set.
+ *
+ * @param flag if true, the flag is set
+ */
+ static public void setMinimizeAlways(boolean flag) {
+ minimize_always = flag;
+ }
+
+ /**
+ * Sets or resets allow mutate flag. If this flag is set, then all automata
+ * operations may modify automata given as input; otherwise, operations will
+ * always leave input automata languages unmodified. By default, the flag is
+ * not set.
+ *
+ * @param flag if true, the flag is set
+ * @return previous value of the flag
+ */
+ static public boolean setAllowMutate(boolean flag) {
+ boolean b = allow_mutation;
+ allow_mutation = flag;
+ return b;
+ }
+
+ /**
+ * Returns the state of the allow mutate flag. If this flag is set, then all
+ * automata operations may modify automata given as input; otherwise,
+ * operations will always leave input automata languages unmodified. By
+ * default, the flag is not set.
+ *
+ * @return current value of the flag
+ */
+ static boolean getAllowMutate() {
+ return allow_mutation;
+ }
+
+ void checkMinimizeAlways() {
+ if (minimize_always) MinimizationOperations.minimize(this);
+ }
+
+ boolean isSingleton() {
+ return singleton != null;
+ }
+
+ /**
+ * Returns the singleton string for this automaton. An automaton that accepts
+ * exactly one string may be represented in singleton mode. In that
+ * case, this method may be used to obtain the string.
+ *
+ * @return string, null if this automaton is not in singleton mode.
+ */
+ public String getSingleton() {
+ return singleton;
+ }
+
+ /**
+ * Sets initial state.
+ *
+ * @param s state
+ */
+ public void setInitialState(State s) {
+ initial = s;
+ singleton = null;
+ }
+
+ /**
+ * Gets initial state.
+ *
+ * @return state
+ */
+ public State getInitialState() {
+ expandSingleton();
+ return initial;
+ }
+
+ /**
+ * Returns deterministic flag for this automaton.
+ *
+ * @return true if the automaton is definitely deterministic, false if the
+ * automaton may be nondeterministic
+ */
+ public boolean isDeterministic() {
+ return deterministic;
+ }
+
+ /**
+ * Sets deterministic flag for this automaton. This method should (only) be
+ * used if automata are constructed manually.
+ *
+ * @param deterministic true if the automaton is definitely deterministic,
+ * false if the automaton may be nondeterministic
+ */
+ public void setDeterministic(boolean deterministic) {
+ this.deterministic = deterministic;
+ }
+
+ /**
+ * Associates extra information with this automaton.
+ *
+ * @param info extra information
+ */
+ public void setInfo(Object info) {
+ this.info = info;
+ }
+
+ /**
+ * Returns extra information associated with this automaton.
+ *
+ * @return extra information
+ * @see #setInfo(Object)
+ */
+ public Object getInfo() {
+ return info;
+ }
+
+ /**
+ * Returns the set of states that are reachable from the initial state.
+ *
+ * @return set of {@link State} objects
+ */
+  public Set<State> getStates() {
+    expandSingleton();
+    Set<State> visited;
+    if (isDebug()) visited = new LinkedHashSet<State>();
+    else visited = new HashSet<State>();
+    LinkedList<State> worklist = new LinkedList<State>();
+    worklist.add(initial);
+    visited.add(initial);
+    while (worklist.size() > 0) {
+      State s = worklist.removeFirst();
+      Collection<Transition> tr;
+      if (isDebug()) tr = s.getSortedTransitions(false);
+      else tr = s.transitions;
+ for (Transition t : tr)
+ if (!visited.contains(t.to)) {
+ visited.add(t.to);
+ worklist.add(t.to);
+ }
+ }
+ return visited;
+ }
+
+ /**
+ * Returns the set of reachable accept states.
+ *
+ * @return set of {@link State} objects
+ */
+  public Set<State> getAcceptStates() {
+    expandSingleton();
+    HashSet<State> accepts = new HashSet<State>();
+    HashSet<State> visited = new HashSet<State>();
+    LinkedList<State> worklist = new LinkedList<State>();
+ worklist.add(initial);
+ visited.add(initial);
+ while (worklist.size() > 0) {
+ State s = worklist.removeFirst();
+ if (s.accept) accepts.add(s);
+ for (Transition t : s.transitions)
+ if (!visited.contains(t.to)) {
+ visited.add(t.to);
+ worklist.add(t.to);
+ }
+ }
+ return accepts;
+ }
+
+ /**
+ * Assigns consecutive numbers to the given states.
+ */
+  static void setStateNumbers(Set<State> states) {
+ int number = 0;
+ for (State s : states)
+ s.number = number++;
+ }
+
+ /**
+ * Adds transitions to explicit crash state to ensure that transition function
+ * is total.
+ */
+ void totalize() {
+ State s = new State();
+ s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+ s));
+ for (State p : getStates()) {
+ int maxi = Character.MIN_VALUE;
+ for (Transition t : p.getSortedTransitions(false)) {
+ if (t.min > maxi) p.transitions.add(new Transition((char) maxi,
+ (char) (t.min - 1), s));
+ if (t.max + 1 > maxi) maxi = t.max + 1;
+ }
+ if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition(
+ (char) maxi, Character.MAX_VALUE, s));
+ }
+ }
+
+ /**
+ * Restores representation invariant. This method must be invoked before any
+ * built-in automata operation is performed if automaton states or transitions
+ * are manipulated manually.
+ *
+ * @see #setDeterministic(boolean)
+ */
+ public void restoreInvariant() {
+ removeDeadTransitions();
+ }
+
+ /**
+ * Reduces this automaton. An automaton is "reduced" by combining overlapping
+ * and adjacent edge intervals with same destination.
+ */
+ public void reduce() {
+ if (isSingleton()) return;
+    Set<State> states = getStates();
+    setStateNumbers(states);
+    for (State s : states) {
+      List<Transition> st = s.getSortedTransitions(true);
+ s.resetTransitions();
+ State p = null;
+ int min = -1, max = -1;
+ for (Transition t : st) {
+ if (p == t.to) {
+ if (t.min <= max + 1) {
+ if (t.max > max) max = t.max;
+ } else {
+ if (p != null) s.transitions.add(new Transition((char) min,
+ (char) max, p));
+ min = t.min;
+ max = t.max;
+ }
+ } else {
+ if (p != null) s.transitions.add(new Transition((char) min,
+ (char) max, p));
+ p = t.to;
+ min = t.min;
+ max = t.max;
+ }
+ }
+ if (p != null) s.transitions
+ .add(new Transition((char) min, (char) max, p));
+ }
+ }
+
+ /**
+ * Returns sorted array of all interval start points.
+ */
+ char[] getStartPoints() {
+    Set<Character> pointset = new HashSet<Character>();
+ for (State s : getStates()) {
+ pointset.add(Character.MIN_VALUE);
+ for (Transition t : s.transitions) {
+ pointset.add(t.min);
+ if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1));
+ }
+ }
+ char[] points = new char[pointset.size()];
+ int n = 0;
+ for (Character m : pointset)
+ points[n++] = m;
+ Arrays.sort(points);
+ return points;
+ }
+
+ /**
+ * Returns the set of live states. A state is "live" if an accept state is
+ * reachable from it.
+ *
+ * @return set of {@link State} objects
+ */
+  public Set<State> getLiveStates() {
+ expandSingleton();
+ return getLiveStates(getStates());
+ }
+
+  private Set<State> getLiveStates(Set<State> states) {
+    HashMap<State, Set<State>> map = new HashMap<State, Set<State>>();
+    for (State s : states)
+      map.put(s, new HashSet<State>());
+    for (State s : states)
+      for (Transition t : s.transitions)
+        map.get(t.to).add(s);
+    Set<State> live = new HashSet<State>(getAcceptStates());
+    LinkedList<State> worklist = new LinkedList<State>(live);
+ while (worklist.size() > 0) {
+ State s = worklist.removeFirst();
+ for (State p : map.get(s))
+ if (!live.contains(p)) {
+ live.add(p);
+ worklist.add(p);
+ }
+ }
+ return live;
+ }
+
+ /**
+ * Removes transitions to dead states and calls {@link #reduce()} and
+ * {@link #clearHashCode()}. (A state is "dead" if no accept state is
+ * reachable from it.)
+ */
+ public void removeDeadTransitions() {
+ clearHashCode();
+ if (isSingleton()) return;
+    Set<State> states = getStates();
+    Set<State> live = getLiveStates(states);
+    for (State s : states) {
+      Set<Transition> st = s.transitions;
+ s.resetTransitions();
+ for (Transition t : st)
+ if (live.contains(t.to)) s.transitions.add(t);
+ }
+ reduce();
+ }
+
+ /**
+ * Returns a sorted array of transitions for each state (and sets state
+ * numbers).
+ */
+  static Transition[][] getSortedTransitions(Set<State> states) {
+ setStateNumbers(states);
+ Transition[][] transitions = new Transition[states.size()][];
+ for (State s : states)
+ transitions[s.number] = s.getSortedTransitionArray(false);
+ return transitions;
+ }
+
+ /**
+ * Expands singleton representation to normal representation. Does nothing if
+ * not in singleton representation.
+ */
+ public void expandSingleton() {
+ if (isSingleton()) {
+ State p = new State();
+ initial = p;
+ for (int i = 0; i < singleton.length(); i++) {
+ State q = new State();
+ p.transitions.add(new Transition(singleton.charAt(i), q));
+ p = q;
+ }
+ p.accept = true;
+ deterministic = true;
+ singleton = null;
+ }
+ }
+
+ /**
+ * Returns the number of states in this automaton.
+ */
+ public int getNumberOfStates() {
+ if (isSingleton()) return singleton.length() + 1;
+ return getStates().size();
+ }
+
+ /**
+ * Returns the number of transitions in this automaton. This number is counted
+ * as the total number of edges, where one edge may be a character interval.
+ */
+ public int getNumberOfTransitions() {
+ if (isSingleton()) return singleton.length();
+ int c = 0;
+ for (State s : getStates())
+ c += s.transitions.size();
+ return c;
+ }
+
+ /**
+ * Returns true if the language of this automaton is equal to the language of
+ * the given automaton. Implemented using hashCode and
+ * subsetOf.
+ */
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == this) return true;
+ if (!(obj instanceof Automaton)) return false;
+ Automaton a = (Automaton) obj;
+ if (isSingleton() && a.isSingleton()) return singleton.equals(a.singleton);
+ return hashCode() == a.hashCode() && BasicOperations.subsetOf(this, a)
+ && BasicOperations.subsetOf(a, this);
+ }
+
+ /**
+ * Returns hash code for this automaton. The hash code is based on the number
+ * of states and transitions in the minimized automaton. Invoking this method
+ * may involve minimizing the automaton.
+ */
+ @Override
+ public int hashCode() {
+ if (hash_code == 0) MinimizationOperations.minimize(this);
+ return hash_code;
+ }
+
+ /**
+ * Must be invoked when the stored hash code may no longer be valid.
+ */
+ void clearHashCode() {
+ hash_code = 0;
+ }
+
+ /**
+ * Returns a string representation of this automaton.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ if (isSingleton()) {
+ b.append("singleton: ");
+ for (char c : singleton.toCharArray())
+ Transition.appendCharString(c, b);
+ b.append("\n");
+ } else {
+      Set<State> states = getStates();
+ setStateNumbers(states);
+ b.append("initial state: ").append(initial.number).append("\n");
+ for (State s : states)
+ b.append(s.toString());
+ }
+ return b.toString();
+ }
+
+ /**
+ * Returns Graphviz Dot representation of this automaton.
+ */
+ public String toDot() {
+ StringBuilder b = new StringBuilder("digraph Automaton {\n");
+ b.append(" rankdir = LR;\n");
+    Set<State> states = getStates();
+ setStateNumbers(states);
+ for (State s : states) {
+ b.append(" ").append(s.number);
+ if (s.accept) b.append(" [shape=doublecircle,label=\"\"];\n");
+ else b.append(" [shape=circle,label=\"\"];\n");
+ if (s == initial) {
+ b.append(" initial [shape=plaintext,label=\"\"];\n");
+ b.append(" initial -> ").append(s.number).append("\n");
+ }
+ for (Transition t : s.transitions) {
+ b.append(" ").append(s.number);
+ t.appendDot(b);
+ }
+ }
+ return b.append("}\n").toString();
+ }
+
+ /**
+ * Returns a clone of this automaton, expands if singleton.
+ */
+ Automaton cloneExpanded() {
+ Automaton a = clone();
+ a.expandSingleton();
+ return a;
+ }
+
+ /**
+ * Returns a clone of this automaton unless allow_mutation is
+ * set, expands if singleton.
+ */
+ Automaton cloneExpandedIfRequired() {
+ if (allow_mutation) {
+ expandSingleton();
+ return this;
+ } else return cloneExpanded();
+ }
+
+ /**
+ * Returns a clone of this automaton.
+ */
+ @Override
+ public Automaton clone() {
+ try {
+ Automaton a = (Automaton) super.clone();
+ if (!isSingleton()) {
+        HashMap<State, State> m = new HashMap<State, State>();
+        Set<State> states = getStates();
+ for (State s : states)
+ m.put(s, new State());
+ for (State s : states) {
+ State p = m.get(s);
+ p.accept = s.accept;
+ if (s == initial) a.initial = p;
+ for (Transition t : s.transitions)
+ p.transitions.add(new Transition(t.min, t.max, m.get(t.to)));
+ }
+ }
+ return a;
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Returns a clone of this automaton, or this automaton itself if
+ * allow_mutation flag is set.
+ */
+ Automaton cloneIfRequired() {
+ if (allow_mutation) return this;
+ else return clone();
+ }
+
+ /**
+ * See {@link BasicOperations#concatenate(Automaton, Automaton)}.
+ */
+ public Automaton concatenate(Automaton a) {
+ return BasicOperations.concatenate(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#concatenate(List)}.
+ */
+  static public Automaton concatenate(List<Automaton> l) {
+ return BasicOperations.concatenate(l);
+ }
+
+ /**
+ * See {@link BasicOperations#optional(Automaton)}.
+ */
+ public Automaton optional() {
+ return BasicOperations.optional(this);
+ }
+
+ /**
+ * See {@link BasicOperations#repeat(Automaton)}.
+ */
+ public Automaton repeat() {
+ return BasicOperations.repeat(this);
+ }
+
+ /**
+ * See {@link BasicOperations#repeat(Automaton, int)}.
+ */
+ public Automaton repeat(int min) {
+ return BasicOperations.repeat(this, min);
+ }
+
+ /**
+ * See {@link BasicOperations#repeat(Automaton, int, int)}.
+ */
+ public Automaton repeat(int min, int max) {
+ return BasicOperations.repeat(this, min, max);
+ }
+
+ /**
+ * See {@link BasicOperations#complement(Automaton)}.
+ */
+ public Automaton complement() {
+ return BasicOperations.complement(this);
+ }
+
+ /**
+ * See {@link BasicOperations#minus(Automaton, Automaton)}.
+ */
+ public Automaton minus(Automaton a) {
+ return BasicOperations.minus(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#intersection(Automaton, Automaton)}.
+ */
+ public Automaton intersection(Automaton a) {
+ return BasicOperations.intersection(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#subsetOf(Automaton, Automaton)}.
+ */
+ public boolean subsetOf(Automaton a) {
+ return BasicOperations.subsetOf(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#union(Automaton, Automaton)}.
+ */
+ public Automaton union(Automaton a) {
+ return BasicOperations.union(this, a);
+ }
+
+ /**
+ * See {@link BasicOperations#union(Collection)}.
+ */
+  static public Automaton union(Collection<Automaton> l) {
+ return BasicOperations.union(l);
+ }
+
+ /**
+ * See {@link BasicOperations#determinize(Automaton)}.
+ */
+ public void determinize() {
+ BasicOperations.determinize(this);
+ }
+
+ /**
+ * See {@link BasicOperations#isEmptyString(Automaton)}.
+ */
+ public boolean isEmptyString() {
+ return BasicOperations.isEmptyString(this);
+ }
+
+ /**
+ * See {@link MinimizationOperations#minimize(Automaton)}. Returns the
+ * automaton being given as argument.
+ */
+ public static Automaton minimize(Automaton a) {
+ MinimizationOperations.minimize(a);
+ return a;
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\Automaton.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
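
The constructor documentation above describes building automata by hand from State and Transition objects. Here is a minimal sketch using only methods that appear in this patch; the accepted language (single characters 'a'..'z') is just an example:

    Automaton a = new Automaton();
    State accept = new State();
    accept.setAccept(true);
    // one transition from the initial state over the whole range ['a','z']
    a.getInitialState().addTransition(new Transition('a', 'z', accept));
    a.setDeterministic(true);  // constructed without any nondeterminism
    a.restoreInvariant();      // required after manual manipulation
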
Index: src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (revision 0)
@@ -0,0 +1,53 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+
+/**
+ * Automaton provider for RegExp.
+ * {@link RegExp#toAutomaton(AutomatonProvider)}
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public interface AutomatonProvider {
+
+ /**
+ * Returns automaton of the given name.
+ *
+ * @param name automaton name
+ * @return automaton
+ * @throws IOException if errors occur
+ */
+ public Automaton getAutomaton(String name) throws IOException;
+}
Property changes on: src\java\org\apache\lucene\util\automaton\AutomatonProvider.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
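
A sketch of a trivial provider: it resolves named automata from an in-memory map, which is all RegExp needs when a pattern references a named automaton. The names and the map are hypothetical:

    final Map<String, Automaton> named = new HashMap<String, Automaton>();
    named.put("digits", BasicAutomata.makeCharRange('0', '9'));

    AutomatonProvider provider = new AutomatonProvider() {
      public Automaton getAutomaton(String name) {
        return named.get(name);  // null signals "unknown name"
      }
    };
    // e.g. passed as the third argument to the RegexpQuery constructor above,
    // assuming the pattern uses the named-automaton syntax of RegExp
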
Index: src/java/org/apache/lucene/util/automaton/BasicAutomata.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 0)
@@ -0,0 +1,482 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Construction of basic automata.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class BasicAutomata {
+ // used by getWhitespaceAutomaton to match basic whitespace
+ private static final Automaton ws = Automaton.minimize(BasicAutomata
+ .makeCharSet(" \t\n\r").repeat());
+
+ private BasicAutomata() {}
+
+ /**
+ * Returns a new (deterministic) automaton with the empty language.
+ */
+ public static Automaton makeEmpty() {
+ Automaton a = new Automaton();
+ State s = new State();
+ a.initial = s;
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts only the empty string.
+ */
+ public static Automaton makeEmptyString() {
+ Automaton a = new Automaton();
+ a.singleton = "";
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts all strings.
+ */
+ public static Automaton makeAnyString() {
+ Automaton a = new Automaton();
+ State s = new State();
+ a.initial = s;
+ s.accept = true;
+ s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+ s));
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts any single character.
+ */
+ public static Automaton makeAnyChar() {
+ return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE);
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts a single character of
+ * the given value.
+ */
+ public static Automaton makeChar(char c) {
+ Automaton a = new Automaton();
+ a.singleton = Character.toString(c);
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts a single char whose
+ * value is in the given interval (including both end points).
+ */
+ public static Automaton makeCharRange(char min, char max) {
+ if (min == max) return makeChar(min);
+ Automaton a = new Automaton();
+ State s1 = new State();
+ State s2 = new State();
+ a.initial = s1;
+ s2.accept = true;
+ if (min <= max) s1.transitions.add(new Transition(min, max, s2));
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts a single character in
+ * the given set.
+ */
+ public static Automaton makeCharSet(String set) {
+ if (set.length() == 1) return makeChar(set.charAt(0));
+ Automaton a = new Automaton();
+ State s1 = new State();
+ State s2 = new State();
+ a.initial = s1;
+ s2.accept = true;
+ for (int i = 0; i < set.length(); i++)
+ s1.transitions.add(new Transition(set.charAt(i), s2));
+ a.deterministic = true;
+ a.reduce();
+ return a;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of length
+ * x.substring(n).length().
+ */
+ private static State anyOfRightLength(String x, int n) {
+ State s = new State();
+ if (x.length() == n) s.setAccept(true);
+ else s.addTransition(new Transition('0', '9', anyOfRightLength(x, n + 1)));
+ return s;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of value at least
+ * x.substring(n) and length x.substring(n).length().
+ */
+  private static State atLeast(String x, int n, Collection<State> initials,
+ boolean zeros) {
+ State s = new State();
+ if (x.length() == n) s.setAccept(true);
+ else {
+ if (zeros) initials.add(s);
+ char c = x.charAt(n);
+ s.addTransition(new Transition(c, atLeast(x, n + 1, initials, zeros
+ && c == '0')));
+ if (c < '9') s.addTransition(new Transition((char) (c + 1), '9',
+ anyOfRightLength(x, n + 1)));
+ }
+ return s;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of value at most
+ * x.substring(n) and length x.substring(n).length().
+ */
+ private static State atMost(String x, int n) {
+ State s = new State();
+ if (x.length() == n) s.setAccept(true);
+ else {
+ char c = x.charAt(n);
+ s.addTransition(new Transition(c, atMost(x, (char) n + 1)));
+ if (c > '0') s.addTransition(new Transition('0', (char) (c - 1),
+ anyOfRightLength(x, n + 1)));
+ }
+ return s;
+ }
+
+ /**
+ * Constructs sub-automaton corresponding to decimal numbers of value between
+ * x.substring(n) and y.substring(n) and of length x.substring(n).length()
+ * (which must be equal to y.substring(n).length()).
+ */
+ private static State between(String x, String y, int n,
+      Collection<State> initials, boolean zeros) {
+ State s = new State();
+ if (x.length() == n) s.setAccept(true);
+ else {
+ if (zeros) initials.add(s);
+ char cx = x.charAt(n);
+ char cy = y.charAt(n);
+ if (cx == cy) s.addTransition(new Transition(cx, between(x, y, n + 1,
+ initials, zeros && cx == '0')));
+      else { // cx<cy
+        s.addTransition(new Transition(cx, atLeast(x, n + 1, initials, zeros
+            && cx == '0')));
+        s.addTransition(new Transition(cy, atMost(y, n + 1)));
+        if (cx + 1 < cy) s.addTransition(new Transition((char) (cx + 1),
+            (char) (cy - 1), anyOfRightLength(x, n + 1)));
+      }
+    }
+    return s;
+  }
+
+  /**
+   * Returns a new automaton that accepts strings representing decimal numbers
+   * in the given interval.
+   *
+   * @param min minimal value of interval
+   * @param max maximal value of interval (both end points are included in the
+   * interval)
+   * @param digits if >0, use fixed number of digits (strings must be prefixed
+ * by 0's to obtain the right length) - otherwise, the number of
+ * digits is not fixed
+ * @exception IllegalArgumentException if min>max or if numbers in the
+ * interval cannot be expressed with the given fixed number of
+ * digits
+ */
+ public static Automaton makeInterval(int min, int max, int digits)
+ throws IllegalArgumentException {
+ Automaton a = new Automaton();
+ String x = Integer.toString(min);
+ String y = Integer.toString(max);
+ if (min > max || (digits > 0 && y.length() > digits)) throw new IllegalArgumentException();
+ int d;
+ if (digits > 0) d = digits;
+ else d = y.length();
+ StringBuilder bx = new StringBuilder();
+ for (int i = x.length(); i < d; i++)
+ bx.append('0');
+ bx.append(x);
+ x = bx.toString();
+ StringBuilder by = new StringBuilder();
+ for (int i = y.length(); i < d; i++)
+ by.append('0');
+ by.append(y);
+ y = by.toString();
+    Collection<State> initials = new ArrayList<State>();
+ a.initial = between(x, y, 0, initials, digits <= 0);
+ if (digits <= 0) {
+      ArrayList<StatePair> pairs = new ArrayList<StatePair>();
+ for (State p : initials)
+ if (a.initial != p) pairs.add(new StatePair(a.initial, p));
+ BasicOperations.addEpsilons(a, pairs);
+ a.initial.addTransition(new Transition('0', a.initial));
+ a.deterministic = false;
+ } else a.deterministic = true;
+ a.checkMinimizeAlways();
+ return a;
+ }
+
+ /**
+ * Returns a new (deterministic) automaton that accepts the single given
+ * string.
+ */
+ public static Automaton makeString(String s) {
+ Automaton a = new Automaton();
+ a.singleton = s;
+ a.deterministic = true;
+ return a;
+ }
+
+ /**
+ * Constructs automaton that accept strings representing nonnegative integers
+ * that are not larger than the given value.
+ *
+ * @param n string representation of maximum value
+ */
+ public static Automaton makeMaxInteger(String n) {
+ int i = 0;
+ while (i < n.length() && n.charAt(i) == '0')
+ i++;
+ StringBuilder b = new StringBuilder();
+ b.append("0*(0|");
+ if (i < n.length()) b.append("[0-9]{1," + (n.length() - i - 1) + "}|");
+ maxInteger(n.substring(i), 0, b);
+ b.append(")");
+ return Automaton.minimize((new RegExp(b.toString())).toAutomaton());
+ }
+
+ private static void maxInteger(String n, int i, StringBuilder b) {
+ b.append('(');
+ if (i < n.length()) {
+ char c = n.charAt(i);
+ if (c != '0') b.append("[0-" + (char) (c - 1) + "][0-9]{"
+ + (n.length() - i - 1) + "}|");
+ b.append(c);
+ maxInteger(n, i + 1, b);
+ }
+ b.append(')');
+ }
+
+ /**
+ * Constructs automaton that accept strings representing nonnegative integers
+ * that are not less that the given value.
+ *
+ * @param n string representation of minimum value
+ */
+ public static Automaton makeMinInteger(String n) {
+ int i = 0;
+ while (i + 1 < n.length() && n.charAt(i) == '0')
+ i++;
+ StringBuilder b = new StringBuilder();
+ b.append("0*");
+ minInteger(n.substring(i), 0, b);
+ b.append("[0-9]*");
+ return Automaton.minimize((new RegExp(b.toString())).toAutomaton());
+ }
+
+ private static void minInteger(String n, int i, StringBuilder b) {
+ b.append('(');
+ if (i < n.length()) {
+ char c = n.charAt(i);
+ if (c != '9') b.append("[" + (char) (c + 1) + "-9][0-9]{"
+ + (n.length() - i - 1) + "}|");
+ b.append(c);
+ minInteger(n, i + 1, b);
+ }
+ b.append(')');
+ }
+
+ /**
+ * Constructs automaton that accept strings representing decimal numbers that
+ * can be written with at most the given number of digits. Surrounding
+ * whitespace is permitted.
+ *
+ * @param i max number of necessary digits
+ */
+ public static Automaton makeTotalDigits(int i) {
+ return Automaton.minimize((new RegExp("[ \t\n\r]*[-+]?0*([0-9]{0," + i
+ + "}|((([0-9]\\.*){0," + i + "})&@\\.@)0*)[ \t\n\r]*")).toAutomaton());
+ }
+
+ /**
+ * Constructs automaton that accept strings representing decimal numbers that
+ * can be written with at most the given number of digits in the fraction
+ * part. Surrounding whitespace is permitted.
+ *
+ * @param i max number of necessary fraction digits
+ */
+ public static Automaton makeFractionDigits(int i) {
+ return Automaton.minimize((new RegExp("[ \t\n\r]*[-+]?[0-9]+(\\.[0-9]{0,"
+ + i + "}0*)?[ \t\n\r]*")).toAutomaton());
+ }
+
+ /**
+ * Constructs automaton that accept strings representing the given integer.
+ * Surrounding whitespace is permitted.
+ *
+ * @param value string representation of integer
+ */
+ public static Automaton makeIntegerValue(String value) {
+ boolean minus = false;
+ int i = 0;
+ while (i < value.length()) {
+ char c = value.charAt(i);
+ if (c == '-') minus = true;
+ if (c >= '1' && c <= '9') break;
+ i++;
+ }
+ StringBuilder b = new StringBuilder();
+ b.append(value.substring(i));
+ if (b.length() == 0) b.append("0");
+ Automaton s;
+ if (minus) s = makeChar('-');
+ else s = makeChar('+').optional();
+ Automaton ws = getWhitespaceAutomaton();
+ return Automaton.minimize(ws.concatenate(
+ s.concatenate(makeChar('0').repeat()).concatenate(
+ makeString(b.toString()))).concatenate(ws));
+ }
+
+ /**
+ * Constructs automaton that accept strings representing the given decimal
+ * number. Surrounding whitespace is permitted.
+ *
+ * @param value string representation of decimal number
+ */
+ public static Automaton makeDecimalValue(String value) {
+ boolean minus = false;
+ int i = 0;
+ while (i < value.length()) {
+ char c = value.charAt(i);
+ if (c == '-') minus = true;
+ if ((c >= '1' && c <= '9') || c == '.') break;
+ i++;
+ }
+ StringBuilder b1 = new StringBuilder();
+ StringBuilder b2 = new StringBuilder();
+ int p = value.indexOf('.', i);
+ if (p == -1) b1.append(value.substring(i));
+ else {
+ b1.append(value.substring(i, p));
+ i = value.length() - 1;
+ while (i > p) {
+ char c = value.charAt(i);
+ if (c >= '1' && c <= '9') break;
+ i--;
+ }
+ b2.append(value.substring(p + 1, i + 1));
+ }
+ if (b1.length() == 0) b1.append("0");
+ Automaton s;
+ if (minus) s = makeChar('-');
+ else s = makeChar('+').optional();
+ Automaton d;
+ if (b2.length() == 0) d = makeChar('.')
+ .concatenate(makeChar('0').repeat(1)).optional();
+ else d = makeChar('.').concatenate(makeString(b2.toString())).concatenate(
+ makeChar('0').repeat());
+ Automaton ws = getWhitespaceAutomaton();
+ return Automaton.minimize(ws.concatenate(
+ s.concatenate(makeChar('0').repeat()).concatenate(
+ makeString(b1.toString())).concatenate(d)).concatenate(ws));
+ }
+
+ /**
+ * Constructs deterministic automaton that matches strings that contain the
+ * given substring.
+ */
+ public static Automaton makeStringMatcher(String s) {
+ Automaton a = new Automaton();
+ State[] states = new State[s.length() + 1];
+ states[0] = a.initial;
+ for (int i = 0; i < s.length(); i++)
+ states[i + 1] = new State();
+ State f = states[s.length()];
+ f.accept = true;
+ f.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+ f));
+ for (int i = 0; i < s.length(); i++) {
+      Set<Character> done = new HashSet<Character>();
+ char c = s.charAt(i);
+ states[i].transitions.add(new Transition(c, states[i + 1]));
+ done.add(c);
+ for (int j = i; j >= 1; j--) {
+ char d = s.charAt(j - 1);
+ if (!done.contains(d)
+ && s.substring(0, j - 1).equals(s.substring(i - j + 1, i))) {
+ states[i].transitions.add(new Transition(d, states[j]));
+ done.add(d);
+ }
+ }
+ char[] da = new char[done.size()];
+ int h = 0;
+ for (char w : done)
+ da[h++] = w;
+ Arrays.sort(da);
+ int from = Character.MIN_VALUE;
+ int k = 0;
+ while (from <= Character.MAX_VALUE) {
+ while (k < da.length && da[k] == from) {
+ k++;
+ from++;
+ }
+ if (from <= Character.MAX_VALUE) {
+ int to = Character.MAX_VALUE;
+ if (k < da.length) {
+ to = da[k] - 1;
+ k++;
+ }
+ states[i].transitions.add(new Transition((char) from, (char) to,
+ states[0]));
+ from = to + 2;
+ }
+ }
+ }
+ a.deterministic = true;
+ return a;
+ }
+
+ private static Automaton getWhitespaceAutomaton() {
+ return ws;
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\BasicAutomata.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
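For reference, a minimal usage sketch of the factory methods above (illustrative only, not part of the patch): it calls only public methods introduced by this patch, BasicAutomata.makeStringMatcher, BasicAutomata.makeDecimalValue and BasicOperations.run (defined in the next file); the class name and test strings are invented.

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.BasicOperations;

    public class BasicAutomataExample {
      public static void main(String[] args) {
        // automaton accepting every string that contains "luce" as a substring
        Automaton matcher = BasicAutomata.makeStringMatcher("luce");
        System.out.println(BasicOperations.run(matcher, "lucene")); // true
        System.out.println(BasicOperations.run(matcher, "solr"));   // false

        // automaton accepting exactly the decimal value 1.50, allowing an
        // optional '+', redundant zeros and surrounding whitespace
        Automaton value = BasicAutomata.makeDecimalValue("1.50");
        System.out.println(BasicOperations.run(value, " 01.5 "));   // true
        System.out.println(BasicOperations.run(value, "1.51"));     // false
      }
    }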
Index: src/java/org/apache/lucene/util/automaton/BasicOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 0)
@@ -0,0 +1,624 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Basic automata operations.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class BasicOperations {
+
+ private BasicOperations() {}
+
+ /**
+ * Returns an automaton that accepts the concatenation of the languages of the
+ * given automata.
+ *
+ * Complexity: linear in number of states.
+ */
+ static public Automaton concatenate(Automaton a1, Automaton a2) {
+ if (a1.isSingleton() && a2.isSingleton()) return BasicAutomata
+ .makeString(a1.singleton + a2.singleton);
+ if (a1 == a2) {
+ a1 = a1.cloneExpanded();
+ a2 = a2.cloneExpanded();
+ } else {
+ a1 = a1.cloneExpandedIfRequired();
+ a2 = a2.cloneExpandedIfRequired();
+ }
+ for (State s : a1.getAcceptStates()) {
+ s.accept = false;
+ s.addEpsilon(a2.initial);
+ }
+ a1.deterministic = false;
+ a1.clearHashCode();
+ a1.checkMinimizeAlways();
+ return a1;
+ }
+
+ /**
+ * Returns an automaton that accepts the concatenation of the languages of the
+ * given automata.
+ *
+ * Complexity: linear in total number of states.
+ */
+ static public Automaton concatenate(List<Automaton> l) {
+ if (l.isEmpty()) return BasicAutomata.makeEmptyString();
+ boolean all_singleton = true;
+ for (Automaton a : l)
+ if (!a.isSingleton()) {
+ all_singleton = false;
+ break;
+ }
+ if (all_singleton) {
+ StringBuilder b = new StringBuilder();
+ for (Automaton a : l)
+ b.append(a.singleton);
+ return BasicAutomata.makeString(b.toString());
+ } else {
+ for (Automaton a : l)
+ if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty();
+ Set<Integer> ids = new HashSet<Integer>();
+ for (Automaton a : l)
+ ids.add(System.identityHashCode(a));
+ boolean has_aliases = ids.size() != l.size();
+ Automaton b = l.get(0);
+ if (has_aliases) b = b.cloneExpanded();
+ else b = b.cloneExpandedIfRequired();
+ Set<State> ac = b.getAcceptStates();
+ boolean first = true;
+ for (Automaton a : l)
+ if (first) first = false;
+ else {
+ if (a.isEmptyString()) continue;
+ Automaton aa = a;
+ if (has_aliases) aa = aa.cloneExpanded();
+ else aa = aa.cloneExpandedIfRequired();
+ Set<State> ns = aa.getAcceptStates();
+ for (State s : ac) {
+ s.accept = false;
+ s.addEpsilon(aa.initial);
+ if (s.accept) ns.add(s);
+ }
+ ac = ns;
+ }
+ b.deterministic = false;
+ b.clearHashCode();
+ b.checkMinimizeAlways();
+ return b;
+ }
+ }
+
+ /**
+ * Returns an automaton that accepts the union of the empty string and the
+ * language of the given automaton.
+ *
+ * Complexity: linear in number of states.
+ */
+ static public Automaton optional(Automaton a) {
+ a = a.cloneExpandedIfRequired();
+ State s = new State();
+ s.addEpsilon(a.initial);
+ s.accept = true;
+ a.initial = s;
+ a.deterministic = false;
+ a.clearHashCode();
+ a.checkMinimizeAlways();
+ return a;
+ }
+
+ /**
+ * Returns an automaton that accepts the Kleene star (zero or more
+ * concatenated repetitions) of the language of the given automaton. Never
+ * modifies the input automaton language.
+ *
+ * Complexity: linear in number of states.
+ */
+ static public Automaton repeat(Automaton a) {
+ a = a.cloneExpanded();
+ State s = new State();
+ s.accept = true;
+ s.addEpsilon(a.initial);
+ for (State p : a.getAcceptStates())
+ p.addEpsilon(s);
+ a.initial = s;
+ a.deterministic = false;
+ a.clearHashCode();
+ a.checkMinimizeAlways();
+ return a;
+ }
+
+ /**
+ * Returns an automaton that accepts min or more concatenated
+ * repetitions of the language of the given automaton.
+ *
+ * Complexity: linear in number of states and in min.
+ */
+ static public Automaton repeat(Automaton a, int min) {
+ if (min == 0) return repeat(a);
+ List<Automaton> as = new ArrayList<Automaton>();
+ while (min-- > 0)
+ as.add(a);
+ as.add(repeat(a));
+ return concatenate(as);
+ }
+
+ /**
+ * Returns an automaton that accepts between min and
+ * max (including both) concatenated repetitions of the language
+ * of the given automaton.
+ *
+ * Complexity: linear in number of states and in min and
+ * max.
+ */
+ static public Automaton repeat(Automaton a, int min, int max) {
+ if (min > max) return BasicAutomata.makeEmpty();
+ max -= min;
+ a.expandSingleton();
+ Automaton b;
+ if (min == 0) b = BasicAutomata.makeEmptyString();
+ else if (min == 1) b = a.clone();
+ else {
+ List<Automaton> as = new ArrayList<Automaton>();
+ while (min-- > 0)
+ as.add(a);
+ b = concatenate(as);
+ }
+ if (max > 0) {
+ Automaton d = a.clone();
+ while (--max > 0) {
+ Automaton c = a.clone();
+ for (State p : c.getAcceptStates())
+ p.addEpsilon(d.initial);
+ d = c;
+ }
+ for (State p : b.getAcceptStates())
+ p.addEpsilon(d.initial);
+ b.deterministic = false;
+ b.clearHashCode();
+ b.checkMinimizeAlways();
+ }
+ return b;
+ }
+
+ /**
+ * Returns a (deterministic) automaton that accepts the complement of the
+ * language of the given automaton.
+ *
+ * Complexity: linear in number of states (if already deterministic).
+ */
+ static public Automaton complement(Automaton a) {
+ a = a.cloneExpandedIfRequired();
+ a.determinize();
+ a.totalize();
+ for (State p : a.getStates())
+ p.accept = !p.accept;
+ a.removeDeadTransitions();
+ return a;
+ }
+
+ /**
+ * Returns a (deterministic) automaton that accepts the intersection of the
+ * language of a1 and the complement of the language of
+ * a2. As a side-effect, the automata may be determinized, if not
+ * already deterministic.
+ *
+ * Complexity: quadratic in number of states (if already deterministic).
+ */
+ static public Automaton minus(Automaton a1, Automaton a2) {
+ if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata
+ .makeEmpty();
+ if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired();
+ if (a1.isSingleton()) {
+ if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty();
+ else return a1.cloneIfRequired();
+ }
+ return intersection(a1, a2.complement());
+ }
+
+ /**
+ * Returns an automaton that accepts the intersection of the languages of the
+ * given automata. Never modifies the input automata languages.
+ *
+ * Complexity: quadratic in number of states.
+ */
+ static public Automaton intersection(Automaton a1, Automaton a2) {
+ if (a1.isSingleton()) {
+ if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired();
+ else return BasicAutomata.makeEmpty();
+ }
+ if (a2.isSingleton()) {
+ if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired();
+ else return BasicAutomata.makeEmpty();
+ }
+ if (a1 == a2) return a1.cloneIfRequired();
+ Transition[][] transitions1 = Automaton
+ .getSortedTransitions(a1.getStates());
+ Transition[][] transitions2 = Automaton
+ .getSortedTransitions(a2.getStates());
+ Automaton c = new Automaton();
+ LinkedList<StatePair> worklist = new LinkedList<StatePair>();
+ HashMap<StatePair,StatePair> newstates = new HashMap<StatePair,StatePair>();
+ StatePair p = new StatePair(c.initial, a1.initial, a2.initial);
+ worklist.add(p);
+ newstates.put(p, p);
+ while (worklist.size() > 0) {
+ p = worklist.removeFirst();
+ p.s.accept = p.s1.accept && p.s2.accept;
+ Transition[] t1 = transitions1[p.s1.number];
+ Transition[] t2 = transitions2[p.s2.number];
+ for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+ while (b2 < t2.length && t2[b2].max < t1[n1].min)
+ b2++;
+ for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
+ if (t2[n2].max >= t1[n1].min) {
+ StatePair q = new StatePair(t1[n1].to, t2[n2].to);
+ StatePair r = newstates.get(q);
+ if (r == null) {
+ q.s = new State();
+ worklist.add(q);
+ newstates.put(q, q);
+ r = q;
+ }
+ char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
+ char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
+ p.s.transitions.add(new Transition(min, max, r.s));
+ }
+ }
+ }
+ c.deterministic = a1.deterministic && a2.deterministic;
+ c.removeDeadTransitions();
+ c.checkMinimizeAlways();
+ return c;
+ }
+
+ /**
+ * Returns true if the language of a1 is a subset of the language
+ * of a2. As a side-effect, a2 is determinized if
+ * not already marked as deterministic.
+ *
+ * Complexity: quadratic in number of states.
+ */
+ public static boolean subsetOf(Automaton a1, Automaton a2) {
+ if (a1 == a2) return true;
+ if (a1.isSingleton()) {
+ if (a2.isSingleton()) return a1.singleton.equals(a2.singleton);
+ return BasicOperations.run(a2, a1.singleton);
+ }
+ a2.determinize();
+ Transition[][] transitions1 = Automaton
+ .getSortedTransitions(a1.getStates());
+ Transition[][] transitions2 = Automaton
+ .getSortedTransitions(a2.getStates());
+ LinkedList<StatePair> worklist = new LinkedList<StatePair>();
+ HashSet<StatePair> visited = new HashSet<StatePair>();
+ StatePair p = new StatePair(a1.initial, a2.initial);
+ worklist.add(p);
+ visited.add(p);
+ while (worklist.size() > 0) {
+ p = worklist.removeFirst();
+ if (p.s1.accept && !p.s2.accept) return false;
+ Transition[] t1 = transitions1[p.s1.number];
+ Transition[] t2 = transitions2[p.s2.number];
+ for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+ while (b2 < t2.length && t2[b2].max < t1[n1].min)
+ b2++;
+ int min1 = t1[n1].min, max1 = t1[n1].max;
+ for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
+ if (t2[n2].min > min1) return false;
+ if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1;
+ else {
+ min1 = Character.MAX_VALUE;
+ max1 = Character.MIN_VALUE;
+ }
+ StatePair q = new StatePair(t1[n1].to, t2[n2].to);
+ if (!visited.contains(q)) {
+ worklist.add(q);
+ visited.add(q);
+ }
+ }
+ if (min1 <= max1) return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Returns an automaton that accepts the union of the languages of the given
+ * automata.
+ *
+ * Complexity: linear in number of states.
+ */
+ public static Automaton union(Automaton a1, Automaton a2) {
+ if ((a1.isSingleton() && a2.isSingleton() && a1.singleton
+ .equals(a2.singleton))
+ || a1 == a2) return a1.cloneIfRequired();
+ if (a1 == a2) {
+ a1 = a1.cloneExpanded();
+ a2 = a2.cloneExpanded();
+ } else {
+ a1 = a1.cloneExpandedIfRequired();
+ a2 = a2.cloneExpandedIfRequired();
+ }
+ State s = new State();
+ s.addEpsilon(a1.initial);
+ s.addEpsilon(a2.initial);
+ a1.initial = s;
+ a1.deterministic = false;
+ a1.clearHashCode();
+ a1.checkMinimizeAlways();
+ return a1;
+ }
+
+ /**
+ * Returns an automaton that accepts the union of the languages of the given
+ * automata.
+ *
+ * Complexity: linear in number of states.
+ */
+ public static Automaton union(Collection<Automaton> l) {
+ Set<Integer> ids = new HashSet<Integer>();
+ for (Automaton a : l)
+ ids.add(System.identityHashCode(a));
+ boolean has_aliases = ids.size() != l.size();
+ State s = new State();
+ for (Automaton b : l) {
+ if (BasicOperations.isEmpty(b)) continue;
+ Automaton bb = b;
+ if (has_aliases) bb = bb.cloneExpanded();
+ else bb = bb.cloneExpandedIfRequired();
+ s.addEpsilon(bb.initial);
+ }
+ Automaton a = new Automaton();
+ a.initial = s;
+ a.deterministic = false;
+ a.clearHashCode();
+ a.checkMinimizeAlways();
+ return a;
+ }
+
+ /**
+ * Determinizes the given automaton.
+ *
+ * Complexity: exponential in number of states.
+ */
+ public static void determinize(Automaton a) {
+ if (a.deterministic || a.isSingleton()) return;
+ Set<State> initialset = new HashSet<State>();
+ initialset.add(a.initial);
+ determinize(a, initialset);
+ }
+
+ /**
+ * Determinizes the given automaton using the given set of initial states.
+ */
+ static void determinize(Automaton a, Set<State> initialset) {
+ char[] points = a.getStartPoints();
+ // subset construction
+ Map<Set<State>, Set<State>> sets = new HashMap<Set<State>, Set<State>>();
+ LinkedList<Set<State>> worklist = new LinkedList<Set<State>>();
+ Map<Set<State>, State> newstate = new HashMap<Set<State>, State>();
+ sets.put(initialset, initialset);
+ worklist.add(initialset);
+ a.initial = new State();
+ newstate.put(initialset, a.initial);
+ while (worklist.size() > 0) {
+ Set<State> s = worklist.removeFirst();
+ State r = newstate.get(s);
+ for (State q : s)
+ if (q.accept) {
+ r.accept = true;
+ break;
+ }
+ for (int n = 0; n < points.length; n++) {
+ Set<State> p = new HashSet<State>();
+ for (State q : s)
+ for (Transition t : q.transitions)
+ if (t.min <= points[n] && points[n] <= t.max) p.add(t.to);
+ if (!sets.containsKey(p)) {
+ sets.put(p, p);
+ worklist.add(p);
+ newstate.put(p, new State());
+ }
+ State q = newstate.get(p);
+ char min = points[n];
+ char max;
+ if (n + 1 < points.length) max = (char) (points[n + 1] - 1);
+ else max = Character.MAX_VALUE;
+ r.transitions.add(new Transition(min, max, q));
+ }
+ }
+ a.deterministic = true;
+ a.removeDeadTransitions();
+ }
+
+ /**
+ * Adds epsilon transitions to the given automaton. This method adds extra
+ * character interval transitions that are equivalent to the given set of
+ * epsilon transitions.
+ *
+ * @param pairs collection of {@link StatePair} objects representing pairs of
+ * source/destination states where epsilon transitions should be
+ * added
+ */
+ public static void addEpsilons(Automaton a, Collection<StatePair> pairs) {
+ a.expandSingleton();
+ HashMap<State, HashSet<State>> forward = new HashMap<State, HashSet<State>>();
+ HashMap<State, HashSet<State>> back = new HashMap<State, HashSet<State>>();
+ for (StatePair p : pairs) {
+ HashSet<State> to = forward.get(p.s1);
+ if (to == null) {
+ to = new HashSet<State>();
+ forward.put(p.s1, to);
+ }
+ to.add(p.s2);
+ HashSet<State> from = back.get(p.s2);
+ if (from == null) {
+ from = new HashSet<State>();
+ back.put(p.s2, from);
+ }
+ from.add(p.s1);
+ }
+ // calculate epsilon closure
+ LinkedList<StatePair> worklist = new LinkedList<StatePair>(pairs);
+ HashSet<StatePair> workset = new HashSet<StatePair>(pairs);
+ while (!worklist.isEmpty()) {
+ StatePair p = worklist.removeFirst();
+ workset.remove(p);
+ HashSet<State> to = forward.get(p.s2);
+ HashSet<State> from = back.get(p.s1);
+ if (to != null) {
+ for (State s : to) {
+ StatePair pp = new StatePair(p.s1, s);
+ if (!pairs.contains(pp)) {
+ pairs.add(pp);
+ forward.get(p.s1).add(s);
+ back.get(s).add(p.s1);
+ worklist.add(pp);
+ workset.add(pp);
+ if (from != null) {
+ for (State q : from) {
+ StatePair qq = new StatePair(q, p.s1);
+ if (!workset.contains(qq)) {
+ worklist.add(qq);
+ workset.add(qq);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ // add transitions
+ for (StatePair p : pairs)
+ p.s1.addEpsilon(p.s2);
+ a.deterministic = false;
+ a.clearHashCode();
+ a.checkMinimizeAlways();
+ }
+
+ /**
+ * Returns true if the given automaton accepts the empty string and nothing
+ * else.
+ */
+ public static boolean isEmptyString(Automaton a) {
+ if (a.isSingleton()) return a.singleton.length() == 0;
+ else return a.initial.accept && a.initial.transitions.isEmpty();
+ }
+
+ /**
+ * Returns true if the given automaton accepts no strings.
+ */
+ public static boolean isEmpty(Automaton a) {
+ if (a.isSingleton()) return false;
+ return !a.initial.accept && a.initial.transitions.isEmpty();
+ }
+
+ /**
+ * Returns true if the given automaton accepts all strings.
+ */
+ public static boolean isTotal(Automaton a) {
+ if (a.isSingleton()) return false;
+ if (a.initial.accept && a.initial.transitions.size() == 1) {
+ Transition t = a.initial.transitions.iterator().next();
+ return t.to == a.initial && t.min == Character.MIN_VALUE
+ && t.max == Character.MAX_VALUE;
+ }
+ return false;
+ }
+
+ /**
+ * Returns true if the given string is accepted by the automaton.
+ *
+ * Complexity: linear in the length of the string.
+ *
+ * Note: for full performance, use the {@link RunAutomaton} class.
+ */
+ public static boolean run(Automaton a, String s) {
+ if (a.isSingleton()) return s.equals(a.singleton);
+ if (a.deterministic) {
+ State p = a.initial;
+ for (int i = 0; i < s.length(); i++) {
+ State q = p.step(s.charAt(i));
+ if (q == null) return false;
+ p = q;
+ }
+ return p.accept;
+ } else {
+ Set<State> states = a.getStates();
+ Automaton.setStateNumbers(states);
+ LinkedList<State> pp = new LinkedList<State>();
+ LinkedList<State> pp_other = new LinkedList<State>();
+ BitSet bb = new BitSet(states.size());
+ BitSet bb_other = new BitSet(states.size());
+ pp.add(a.initial);
+ ArrayList<State> dest = new ArrayList<State>();
+ boolean accept = a.initial.accept;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ accept = false;
+ pp_other.clear();
+ bb_other.clear();
+ for (State p : pp) {
+ dest.clear();
+ p.step(c, dest);
+ for (State q : dest) {
+ if (q.accept) accept = true;
+ if (!bb_other.get(q.number)) {
+ bb_other.set(q.number);
+ pp_other.add(q);
+ }
+ }
+ }
+ LinkedList<State> tp = pp;
+ pp = pp_other;
+ pp_other = tp;
+ BitSet tb = bb;
+ bb = bb_other;
+ bb_other = tb;
+ }
+ return accept;
+ }
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\BasicOperations.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
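A short sketch of how the operations above compose (illustrative only, not part of the patch; the literal strings are arbitrary, and BasicAutomata.makeString is assumed to be the public factory already called from concatenate above):

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.BasicOperations;

    public class BasicOperationsExample {
      public static void main(String[] args) {
        Automaton foo = BasicAutomata.makeString("foo");
        Automaton bar = BasicAutomata.makeString("bar");

        // union: the language {"foo", "bar"}
        Automaton either = BasicOperations.union(foo, bar);
        System.out.println(BasicOperations.run(either, "bar"));    // true

        // {"foo"} is a subset of {"foo", "bar"}
        System.out.println(BasicOperations.subsetOf(foo, either)); // true

        // concatenation: the language {"foobar"}
        Automaton both = BasicOperations.concatenate(foo, bar);
        System.out.println(BasicOperations.run(both, "foobar"));   // true

        // intersection of two disjoint singleton languages is empty
        Automaton none = BasicOperations.intersection(foo, bar);
        System.out.println(BasicOperations.isEmpty(none));         // true
      }
    }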
Index: src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 0)
@@ -0,0 +1,278 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.Set;
+
+/**
+ * Operations for minimizing automata.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class MinimizationOperations {
+
+ private MinimizationOperations() {}
+
+ /**
+ * Minimizes (and determinizes if not already deterministic) the given
+ * automaton.
+ *
+ * @see Automaton#setMinimization(int)
+ */
+ public static void minimize(Automaton a) {
+ if (!a.isSingleton()) {
+ minimizeHopcroft(a);
+ }
+ // recompute hash code
+ a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2;
+ if (a.hash_code == 0) a.hash_code = 1;
+ }
+
+ private static <T> void initialize(ArrayList<T> list, int size) {
+ for (int i = 0; i < size; i++)
+ list.add(null);
+ }
+
+ /**
+ * Minimizes the given automaton using Hopcroft's algorithm.
+ */
+ public static void minimizeHopcroft(Automaton a) {
+ a.determinize();
+ Set<Transition> tr = a.initial.getTransitions();
+ if (tr.size() == 1) {
+ Transition t = tr.iterator().next();
+ if (t.to == a.initial && t.min == Character.MIN_VALUE
+ && t.max == Character.MAX_VALUE) return;
+ }
+ a.totalize();
+ // make arrays for numbered states and effective alphabet
+ Set<State> ss = a.getStates();
+ State[] states = new State[ss.size()];
+ int number = 0;
+ for (State q : ss) {
+ states[number] = q;
+ q.number = number++;
+ }
+ char[] sigma = a.getStartPoints();
+ // initialize data structures
+ ArrayList<ArrayList<LinkedList<State>>> reverse = new ArrayList<ArrayList<LinkedList<State>>>();
+ for (int q = 0; q < states.length; q++) {
+ ArrayList<LinkedList<State>> v = new ArrayList<LinkedList<State>>();
+ initialize(v, sigma.length);
+ reverse.add(v);
+ }
+ boolean[][] reverse_nonempty = new boolean[states.length][sigma.length];
+ ArrayList<LinkedList<State>> partition = new ArrayList<LinkedList<State>>();
+ initialize(partition, states.length);
+ int[] block = new int[states.length];
+ StateList[][] active = new StateList[states.length][sigma.length];
+ StateListNode[][] active2 = new StateListNode[states.length][sigma.length];
+ LinkedList<IntPair> pending = new LinkedList<IntPair>();
+ boolean[][] pending2 = new boolean[sigma.length][states.length];
+ ArrayList<State> split = new ArrayList<State>();
+ boolean[] split2 = new boolean[states.length];
+ ArrayList<Integer> refine = new ArrayList<Integer>();
+ boolean[] refine2 = new boolean[states.length];
+ ArrayList<ArrayList<State>> splitblock = new ArrayList<ArrayList<State>>();
+ initialize(splitblock, states.length);
+ for (int q = 0; q < states.length; q++) {
+ splitblock.set(q, new ArrayList<State>());
+ partition.set(q, new LinkedList<State>());
+ for (int x = 0; x < sigma.length; x++) {
+ reverse.get(q).set(x, new LinkedList<State>());
+ active[q][x] = new StateList();
+ }
+ }
+ // find initial partition and reverse edges
+ for (int q = 0; q < states.length; q++) {
+ State qq = states[q];
+ int j;
+ if (qq.accept) j = 0;
+ else j = 1;
+ partition.get(j).add(qq);
+ block[qq.number] = j;
+ for (int x = 0; x < sigma.length; x++) {
+ char y = sigma[x];
+ State p = qq.step(y);
+ reverse.get(p.number).get(x).add(qq);
+ reverse_nonempty[p.number][x] = true;
+ }
+ }
+ // initialize active sets
+ for (int j = 0; j <= 1; j++)
+ for (int x = 0; x < sigma.length; x++)
+ for (State qq : partition.get(j))
+ if (reverse_nonempty[qq.number][x]) active2[qq.number][x] = active[j][x]
+ .add(qq);
+ // initialize pending
+ for (int x = 0; x < sigma.length; x++) {
+ int a0 = active[0][x].size;
+ int a1 = active[1][x].size;
+ int j;
+ if (a0 <= a1) j = 0;
+ else j = 1;
+ pending.add(new IntPair(j, x));
+ pending2[x][j] = true;
+ }
+ // process pending until fixed point
+ int k = 2;
+ while (!pending.isEmpty()) {
+ IntPair ip = pending.removeFirst();
+ int p = ip.n1;
+ int x = ip.n2;
+ pending2[x][p] = false;
+ // find states that need to be split off their blocks
+ for (StateListNode m = active[p][x].first; m != null; m = m.next)
+ for (State s : reverse.get(m.q.number).get(x))
+ if (!split2[s.number]) {
+ split2[s.number] = true;
+ split.add(s);
+ int j = block[s.number];
+ splitblock.get(j).add(s);
+ if (!refine2[j]) {
+ refine2[j] = true;
+ refine.add(j);
+ }
+ }
+ // refine blocks
+ for (int j : refine) {
+ if (splitblock.get(j).size() < partition.get(j).size()) {
+ LinkedList<State> b1 = partition.get(j);
+ LinkedList<State> b2 = partition.get(k);
+ for (State s : splitblock.get(j)) {
+ b1.remove(s);
+ b2.add(s);
+ block[s.number] = k;
+ for (int c = 0; c < sigma.length; c++) {
+ StateListNode sn = active2[s.number][c];
+ if (sn != null && sn.sl == active[j][c]) {
+ sn.remove();
+ active2[s.number][c] = active[k][c].add(s);
+ }
+ }
+ }
+ // update pending
+ for (int c = 0; c < sigma.length; c++) {
+ int aj = active[j][c].size;
+ int ak = active[k][c].size;
+ if (!pending2[c][j] && 0 < aj && aj <= ak) {
+ pending2[c][j] = true;
+ pending.add(new IntPair(j, c));
+ } else {
+ pending2[c][k] = true;
+ pending.add(new IntPair(k, c));
+ }
+ }
+ k++;
+ }
+ for (State s : splitblock.get(j))
+ split2[s.number] = false;
+ refine2[j] = false;
+ splitblock.get(j).clear();
+ }
+ split.clear();
+ refine.clear();
+ }
+ // make a new state for each equivalence class, set initial state
+ State[] newstates = new State[k];
+ for (int n = 0; n < newstates.length; n++) {
+ State s = new State();
+ newstates[n] = s;
+ for (State q : partition.get(n)) {
+ if (q == a.initial) a.initial = s;
+ s.accept = q.accept;
+ s.number = q.number; // select representative
+ q.number = n;
+ }
+ }
+ // build transitions and set acceptance
+ for (int n = 0; n < newstates.length; n++) {
+ State s = newstates[n];
+ s.accept = states[s.number].accept;
+ for (Transition t : states[s.number].transitions)
+ s.transitions.add(new Transition(t.min, t.max, newstates[t.to.number]));
+ }
+ a.removeDeadTransitions();
+ }
+
+ static class IntPair {
+
+ int n1, n2;
+
+ IntPair(int n1, int n2) {
+ this.n1 = n1;
+ this.n2 = n2;
+ }
+ }
+
+ static class StateList {
+
+ int size;
+
+ StateListNode first, last;
+
+ StateListNode add(State q) {
+ return new StateListNode(q, this);
+ }
+ }
+
+ static class StateListNode {
+
+ State q;
+
+ StateListNode next, prev;
+
+ StateList sl;
+
+ StateListNode(State q, StateList sl) {
+ this.q = q;
+ this.sl = sl;
+ if (sl.size++ == 0) sl.first = sl.last = this;
+ else {
+ sl.last.next = this;
+ prev = sl.last;
+ sl.last = this;
+ }
+ }
+
+ void remove() {
+ sl.size--;
+ if (sl.first == this) sl.first = next;
+ else prev.next = next;
+ if (sl.last == this) sl.last = prev;
+ else next.prev = prev;
+ }
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\MinimizationOperations.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
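A small sketch of the intended call pattern (illustrative, not part of the patch; it assumes Automaton.getNumberOfStates is public, as suggested by its use in minimize above):

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.BasicOperations;
    import org.apache.lucene.util.automaton.MinimizationOperations;

    public class MinimizeExample {
      public static void main(String[] args) {
        // "ab" | "ac" built naively via union is nondeterministic and redundant
        Automaton a = BasicOperations.union(BasicAutomata.makeString("ab"),
                                            BasicAutomata.makeString("ac"));
        int before = a.getNumberOfStates();

        // determinizes first (if needed), then applies Hopcroft's algorithm in place
        MinimizationOperations.minimize(a);

        System.out.println(before + " -> " + a.getNumberOfStates()); // typically fewer states
        System.out.println(BasicOperations.run(a, "ac"));            // language unchanged: true
      }
    }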
Index: src/java/org/apache/lucene/util/automaton/package.html
===================================================================
--- src/java/org/apache/lucene/util/automaton/package.html (revision 0)
+++ src/java/org/apache/lucene/util/automaton/package.html (revision 0)
@@ -0,0 +1,50 @@
+
+
+
+
+Finite-state automaton for regular expressions.
+
+This package contains a full DFA/NFA implementation with Unicode
+alphabet and support for all standard (and a number of non-standard)
+regular expression operations.
+
+The most commonly used functionality is located in the classes
+{@link org.apache.lucene.util.automaton.Automaton} and
+{@link org.apache.lucene.util.automaton.RegExp}.
+
+For more information, go to the package home page at
+http://www.brics.dk/automaton/.
+
+WARNING: The status of the Automaton feature is experimental.
+The APIs introduced here might change in the future and will not be
+supported anymore in such a case.
+
+
Property changes on: src\java\org\apache\lucene\util\automaton\package.html
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
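To make the overview above concrete, an end-to-end sketch (illustrative, not part of the patch; the regular expression and inputs are invented):

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.RegExp;
    import org.apache.lucene.util.automaton.RunAutomaton;

    public class AutomatonPackageExample {
      public static void main(String[] args) {
        // parse a regular expression and compile it into a minimal, deterministic automaton
        Automaton a = new RegExp("[a-z]+[0-9]*").toAutomaton();

        // wrap it in a RunAutomaton for fast, table-driven matching
        RunAutomaton r = new RunAutomaton(a);
        System.out.println(r.run("lucene29")); // true
        System.out.println(r.run("29lucene")); // false
      }
    }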
Index: src/java/org/apache/lucene/util/automaton/RegExp.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/RegExp.java (revision 0)
@@ -0,0 +1,1003 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Regular Expression extension to Automaton.
+ *
+ * Regular expressions are built from the following abstract syntax:
+ *
+ * regexp       ::= unionexp
+ *
+ * unionexp     ::= interexp | unionexp            (union)
+ *                | interexp
+ *
+ * interexp     ::= concatexp & interexp           (intersection)    [OPTIONAL]
+ *                | concatexp
+ *
+ * concatexp    ::= repeatexp concatexp            (concatenation)
+ *                | repeatexp
+ *
+ * repeatexp    ::= repeatexp ?                    (zero or one occurrence)
+ *                | repeatexp *                    (zero or more occurrences)
+ *                | repeatexp +                    (one or more occurrences)
+ *                | repeatexp {n}                  (n occurrences)
+ *                | repeatexp {n,}                 (n or more occurrences)
+ *                | repeatexp {n,m}                (n to m occurrences, including both)
+ *                | complexp
+ *
+ * complexp     ::= ~ complexp                     (complement)      [OPTIONAL]
+ *                | charclassexp
+ *
+ * charclassexp ::= [ charclasses ]                (character class)
+ *                | [^ charclasses ]               (negated character class)
+ *                | simpleexp
+ *
+ * charclasses  ::= charclass charclasses
+ *                | charclass
+ *
+ * charclass    ::= charexp - charexp              (character range, including end-points)
+ *                | charexp
+ *
+ * simpleexp    ::= charexp
+ *                | .                              (any single character)
+ *                | #                              (the empty language)     [OPTIONAL]
+ *                | @                              (any string)             [OPTIONAL]
+ *                | " <Unicode string without double-quotes> "   (a string)
+ *                | ( )                            (the empty string)
+ *                | ( unionexp )                   (precedence override)
+ *                | < <identifier> >               (named automaton)        [OPTIONAL]
+ *                | <n-m>                          (numerical interval)     [OPTIONAL]
+ *
+ * charexp      ::= <Unicode character>            (a single non-reserved character)
+ *                | \ <Unicode character>          (a single character)
+ *
+ * The productions marked [OPTIONAL] are only allowed if
+ * specified by the syntax flags passed to the RegExp constructor.
+ * The reserved characters used in the (enabled) syntax must be escaped with
+ * backslash (\) or double-quotes ("..."). (In
+ * contrast to other regexp syntaxes, this is required also in character
+ * classes.) Be aware that dash (-) has a special meaning in
+ * charclass expressions. An identifier is a string not containing right
+ * angle bracket (>) or dash (-). Numerical
+ * intervals are specified by non-negative decimal integers and include both end
+ * points, and if n and m have the same number
+ * of digits, then the conforming strings must have that length (i.e. prefixed
+ * by 0's).
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class RegExp {
+
+ enum Kind {
+ REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
+ }
+
+ /**
+ * Syntax flag, enables intersection (&).
+ */
+ public static final int INTERSECTION = 0x0001;
+
+ /**
+ * Syntax flag, enables complement (~).
+ */
+ public static final int COMPLEMENT = 0x0002;
+
+ /**
+ * Syntax flag, enables empty language (#).
+ */
+ public static final int EMPTY = 0x0004;
+
+ /**
+ * Syntax flag, enables anystring (@).
+ */
+ public static final int ANYSTRING = 0x0008;
+
+ /**
+ * Syntax flag, enables named automata (<identifier>).
+ */
+ public static final int AUTOMATON = 0x0010;
+
+ /**
+ * Syntax flag, enables numerical intervals (
+ * <n-m>).
+ */
+ public static final int INTERVAL = 0x0020;
+
+ /**
+ * Syntax flag, enables all optional regexp syntax.
+ */
+ public static final int ALL = 0xffff;
+
+ /**
+ * Syntax flag, enables no optional regexp syntax.
+ */
+ public static final int NONE = 0x0000;
+
+ private static boolean allow_mutation = false;
+
+ Kind kind;
+ RegExp exp1, exp2;
+ String s;
+ char c;
+ int min, max, digits;
+ char from, to;
+
+ String b;
+ int flags;
+ int pos;
+
+ RegExp() {}
+
+ /**
+ * Constructs new RegExp from a string. Same as
+ * RegExp(s, ALL).
+ *
+ * @param s regexp string
+ * @exception IllegalArgumentException if an error occurred while parsing the
+ * regular expression
+ */
+ public RegExp(String s) throws IllegalArgumentException {
+ this(s, ALL);
+ }
+
+ /**
+ * Constructs new RegExp from a string.
+ *
+ * @param s regexp string
+ * @param syntax_flags boolean 'or' of optional syntax constructs to be
+ * enabled
+ * @exception IllegalArgumentException if an error occurred while parsing the
+ * regular expression
+ */
+ public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
+ b = s;
+ flags = syntax_flags;
+ RegExp e;
+ if (s.length() == 0) e = makeString("");
+ else {
+ e = parseUnionExp();
+ if (pos < b.length()) throw new IllegalArgumentException(
+ "end-of-string expected at position " + pos);
+ }
+ kind = e.kind;
+ exp1 = e.exp1;
+ exp2 = e.exp2;
+ this.s = e.s;
+ c = e.c;
+ min = e.min;
+ max = e.max;
+ digits = e.digits;
+ from = e.from;
+ to = e.to;
+ b = null;
+ }
+
+ /**
+ * Constructs new Automaton from this RegExp. Same
+ * as toAutomaton(null) (empty automaton map).
+ */
+ public Automaton toAutomaton() {
+ return toAutomatonAllowMutate(null, null);
+ }
+
+ /**
+ * Constructs new Automaton from this RegExp. The
+ * constructed automaton is minimal and deterministic and has no transitions
+ * to dead states.
+ *
+ * @param automaton_provider provider of automata for named identifiers
+ * @exception IllegalArgumentException if this regular expression uses a named
+ * identifier that is not available from the automaton provider
+ */
+ public Automaton toAutomaton(AutomatonProvider automaton_provider)
+ throws IllegalArgumentException {
+ return toAutomatonAllowMutate(null, automaton_provider);
+ }
+
+ /**
+ * Constructs new Automaton from this RegExp. The
+ * constructed automaton is minimal and deterministic and has no transitions
+ * to dead states.
+ *
+ * @param automata a map from automaton identifiers to automata (of type
+ * Automaton).
+ * @exception IllegalArgumentException if this regular expression uses a named
+ * identifier that does not occur in the automaton map
+ */
+ public Automaton toAutomaton(Map<String, Automaton> automata)
+ throws IllegalArgumentException {
+ return toAutomatonAllowMutate(automata, null);
+ }
+
+ /**
+ * Sets or resets allow mutate flag. If this flag is set, then automata
+ * construction uses mutable automata, which is slightly faster but not thread
+ * safe. By default, the flag is not set.
+ *
+ * @param flag if true, the flag is set
+ * @return previous value of the flag
+ */
+ public boolean setAllowMutate(boolean flag) {
+ boolean b = allow_mutation;
+ allow_mutation = flag;
+ return b;
+ }
+
+ private Automaton toAutomatonAllowMutate(Map<String, Automaton> automata,
+ AutomatonProvider automaton_provider) throws IllegalArgumentException {
+ boolean b = false;
+ if (allow_mutation) b = Automaton.setAllowMutate(true); // thread unsafe
+ Automaton a = toAutomaton(automata, automaton_provider);
+ if (allow_mutation) Automaton.setAllowMutate(b);
+ return a;
+ }
+
+ private Automaton toAutomaton(Map<String, Automaton> automata,
+ AutomatonProvider automaton_provider) throws IllegalArgumentException {
+ List<Automaton> list;
+ Automaton a = null;
+ switch (kind) {
+ case REGEXP_UNION:
+ list = new ArrayList<Automaton>();
+ findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
+ findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
+ a = BasicOperations.union(list);
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_CONCATENATION:
+ list = new ArrayList<Automaton>();
+ findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata,
+ automaton_provider);
+ findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
+ automaton_provider);
+ a = BasicOperations.concatenate(list);
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_INTERSECTION:
+ a = exp1.toAutomaton(automata, automaton_provider).intersection(
+ exp2.toAutomaton(automata, automaton_provider));
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_OPTIONAL:
+ a = exp1.toAutomaton(automata, automaton_provider).optional();
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_REPEAT:
+ a = exp1.toAutomaton(automata, automaton_provider).repeat();
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_REPEAT_MIN:
+ a = exp1.toAutomaton(automata, automaton_provider).repeat(min);
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_REPEAT_MINMAX:
+ a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max);
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_COMPLEMENT:
+ a = exp1.toAutomaton(automata, automaton_provider).complement();
+ MinimizationOperations.minimize(a);
+ break;
+ case REGEXP_CHAR:
+ a = BasicAutomata.makeChar(c);
+ break;
+ case REGEXP_CHAR_RANGE:
+ a = BasicAutomata.makeCharRange(from, to);
+ break;
+ case REGEXP_ANYCHAR:
+ a = BasicAutomata.makeAnyChar();
+ break;
+ case REGEXP_EMPTY:
+ a = BasicAutomata.makeEmpty();
+ break;
+ case REGEXP_STRING:
+ a = BasicAutomata.makeString(s);
+ break;
+ case REGEXP_ANYSTRING:
+ a = BasicAutomata.makeAnyString();
+ break;
+ case REGEXP_AUTOMATON:
+ Automaton aa = null;
+ if (automata != null) aa = automata.get(s);
+ if (aa == null && automaton_provider != null) try {
+ aa = automaton_provider.getAutomaton(s);
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ if (aa == null) throw new IllegalArgumentException("'" + s
+ + "' not found");
+ a = aa.clone(); // always clone here (ignore allow_mutate)
+ break;
+ case REGEXP_INTERVAL:
+ a = BasicAutomata.makeInterval(min, max, digits);
+ break;
+ }
+ return a;
+ }
+
+ private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
+ Map<String, Automaton> automata, AutomatonProvider automaton_provider) {
+ if (exp.kind == kind) {
+ findLeaves(exp.exp1, kind, list, automata, automaton_provider);
+ findLeaves(exp.exp2, kind, list, automata, automaton_provider);
+ } else list.add(exp.toAutomaton(automata, automaton_provider));
+ }
+
+ /**
+ * Constructs string from parsed regular expression.
+ */
+ @Override
+ public String toString() {
+ return toStringBuilder(new StringBuilder()).toString();
+ }
+
+ StringBuilder toStringBuilder(StringBuilder b) {
+ switch (kind) {
+ case REGEXP_UNION:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append("|");
+ exp2.toStringBuilder(b);
+ b.append(")");
+ break;
+ case REGEXP_CONCATENATION:
+ exp1.toStringBuilder(b);
+ exp2.toStringBuilder(b);
+ break;
+ case REGEXP_INTERSECTION:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append("&");
+ exp2.toStringBuilder(b);
+ b.append(")");
+ break;
+ case REGEXP_OPTIONAL:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append(")?");
+ break;
+ case REGEXP_REPEAT:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append(")*");
+ break;
+ case REGEXP_REPEAT_MIN:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append("){").append(min).append(",}");
+ break;
+ case REGEXP_REPEAT_MINMAX:
+ b.append("(");
+ exp1.toStringBuilder(b);
+ b.append("){").append(min).append(",").append(max).append("}");
+ break;
+ case REGEXP_COMPLEMENT:
+ b.append("~(");
+ exp1.toStringBuilder(b);
+ b.append(")");
+ break;
+ case REGEXP_CHAR:
+ b.append("\\").append(c);
+ break;
+ case REGEXP_CHAR_RANGE:
+ b.append("[\\").append(from).append("-\\").append(to).append("]");
+ break;
+ case REGEXP_ANYCHAR:
+ b.append(".");
+ break;
+ case REGEXP_EMPTY:
+ b.append("#");
+ break;
+ case REGEXP_STRING:
+ b.append("\"").append(s).append("\"");
+ break;
+ case REGEXP_ANYSTRING:
+ b.append("@");
+ break;
+ case REGEXP_AUTOMATON:
+ b.append("<").append(s).append(">");
+ break;
+ case REGEXP_INTERVAL:
+ String s1 = Integer.toString(min);
+ String s2 = Integer.toString(max);
+ b.append("<");
+ if (digits > 0) for (int i = s1.length(); i < digits; i++)
+ b.append('0');
+ b.append(s1).append("-");
+ if (digits > 0) for (int i = s2.length(); i < digits; i++)
+ b.append('0');
+ b.append(s2).append(">");
+ break;
+ }
+ return b;
+ }
+
+ /**
+ * Returns set of automaton identifiers that occur in this regular expression.
+ */
+ public Set<String> getIdentifiers() {
+ HashSet<String> set = new HashSet<String>();
+ getIdentifiers(set);
+ return set;
+ }
+
+ void getIdentifiers(Set<String> set) {
+ switch (kind) {
+ case REGEXP_UNION:
+ case REGEXP_CONCATENATION:
+ case REGEXP_INTERSECTION:
+ exp1.getIdentifiers(set);
+ exp2.getIdentifiers(set);
+ break;
+ case REGEXP_OPTIONAL:
+ case REGEXP_REPEAT:
+ case REGEXP_REPEAT_MIN:
+ case REGEXP_REPEAT_MINMAX:
+ case REGEXP_COMPLEMENT:
+ exp1.getIdentifiers(set);
+ break;
+ case REGEXP_AUTOMATON:
+ set.add(s);
+ break;
+ default:
+ }
+ }
+
+ static RegExp makeUnion(RegExp exp1, RegExp exp2) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_UNION;
+ r.exp1 = exp1;
+ r.exp2 = exp2;
+ return r;
+ }
+
+ static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
+ if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
+ && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
+ exp1, exp2);
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_CONCATENATION;
+ if (exp1.kind == Kind.REGEXP_CONCATENATION
+ && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
+ && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
+ r.exp1 = exp1.exp1;
+ r.exp2 = makeString(exp1.exp2, exp2);
+ } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
+ && exp2.kind == Kind.REGEXP_CONCATENATION
+ && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
+ r.exp1 = makeString(exp1, exp2.exp1);
+ r.exp2 = exp2.exp2;
+ } else {
+ r.exp1 = exp1;
+ r.exp2 = exp2;
+ }
+ return r;
+ }
+
+ static private RegExp makeString(RegExp exp1, RegExp exp2) {
+ StringBuilder b = new StringBuilder();
+ if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
+ else b.append(exp1.c);
+ if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
+ else b.append(exp2.c);
+ return makeString(b.toString());
+ }
+
+ static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_INTERSECTION;
+ r.exp1 = exp1;
+ r.exp2 = exp2;
+ return r;
+ }
+
+ static RegExp makeOptional(RegExp exp) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_OPTIONAL;
+ r.exp1 = exp;
+ return r;
+ }
+
+ static RegExp makeRepeat(RegExp exp) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_REPEAT;
+ r.exp1 = exp;
+ return r;
+ }
+
+ static RegExp makeRepeat(RegExp exp, int min) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_REPEAT_MIN;
+ r.exp1 = exp;
+ r.min = min;
+ return r;
+ }
+
+ static RegExp makeRepeat(RegExp exp, int min, int max) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_REPEAT_MINMAX;
+ r.exp1 = exp;
+ r.min = min;
+ r.max = max;
+ return r;
+ }
+
+ static RegExp makeComplement(RegExp exp) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_COMPLEMENT;
+ r.exp1 = exp;
+ return r;
+ }
+
+ static RegExp makeChar(char c) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_CHAR;
+ r.c = c;
+ return r;
+ }
+
+ static RegExp makeCharRange(char from, char to) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_CHAR_RANGE;
+ r.from = from;
+ r.to = to;
+ return r;
+ }
+
+ static RegExp makeAnyChar() {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_ANYCHAR;
+ return r;
+ }
+
+ static RegExp makeEmpty() {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_EMPTY;
+ return r;
+ }
+
+ static RegExp makeString(String s) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_STRING;
+ r.s = s;
+ return r;
+ }
+
+ static RegExp makeAnyString() {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_ANYSTRING;
+ return r;
+ }
+
+ static RegExp makeAutomaton(String s) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_AUTOMATON;
+ r.s = s;
+ return r;
+ }
+
+ static RegExp makeInterval(int min, int max, int digits) {
+ RegExp r = new RegExp();
+ r.kind = Kind.REGEXP_INTERVAL;
+ r.min = min;
+ r.max = max;
+ r.digits = digits;
+ return r;
+ }
+
+ private boolean peek(String s) {
+ return more() && s.indexOf(b.charAt(pos)) != -1;
+ }
+
+ private boolean match(char c) {
+ if (pos >= b.length()) return false;
+ if (b.charAt(pos) == c) {
+ pos++;
+ return true;
+ }
+ return false;
+ }
+
+ private boolean more() {
+ return pos < b.length();
+ }
+
+ private char next() throws IllegalArgumentException {
+ if (!more()) throw new IllegalArgumentException("unexpected end-of-string");
+ return b.charAt(pos++);
+ }
+
+ private boolean check(int flag) {
+ return (flags & flag) != 0;
+ }
+
+ final RegExp parseUnionExp() throws IllegalArgumentException {
+ RegExp e = parseInterExp();
+ if (match('|')) e = makeUnion(e, parseUnionExp());
+ return e;
+ }
+
+ final RegExp parseInterExp() throws IllegalArgumentException {
+ RegExp e = parseConcatExp();
+ if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
+ parseInterExp());
+ return e;
+ }
+
+ final RegExp parseConcatExp() throws IllegalArgumentException {
+ RegExp e = parseRepeatExp();
+ if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
+ e, parseConcatExp());
+ return e;
+ }
+
+ final RegExp parseRepeatExp() throws IllegalArgumentException {
+ RegExp e = parseComplExp();
+ while (peek("?*+{")) {
+ if (match('?')) e = makeOptional(e);
+ else if (match('*')) e = makeRepeat(e);
+ else if (match('+')) e = makeRepeat(e, 1);
+ else if (match('{')) {
+ int start = pos;
+ while (peek("0123456789"))
+ next();
+ if (start == pos) throw new IllegalArgumentException(
+ "integer expected at position " + pos);
+ int n = Integer.parseInt(b.substring(start, pos));
+ int m = -1;
+ if (match(',')) {
+ start = pos;
+ while (peek("0123456789"))
+ next();
+ if (start != pos) m = Integer.parseInt(b.substring(start, pos));
+ } else m = n;
+ if (!match('}')) throw new IllegalArgumentException(
+ "expected '}' at position " + pos);
+ if (m == -1) e = makeRepeat(e, n);
+ else e = makeRepeat(e, n, m);
+ }
+ }
+ return e;
+ }
+
+ final RegExp parseComplExp() throws IllegalArgumentException {
+ if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
+ else return parseCharClassExp();
+ }
+
+ final RegExp parseCharClassExp() throws IllegalArgumentException {
+ if (match('[')) {
+ boolean negate = false;
+ if (match('^')) negate = true;
+ RegExp e = parseCharClasses();
+ if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e));
+ if (!match(']')) throw new IllegalArgumentException(
+ "expected ']' at position " + pos);
+ return e;
+ } else return parseSimpleExp();
+ }
+
+ final RegExp parseCharClasses() throws IllegalArgumentException {
+ RegExp e = parseCharClass();
+ while (more() && !peek("]"))
+ e = makeUnion(e, parseCharClass());
+ return e;
+ }
+
+ final RegExp parseCharClass() throws IllegalArgumentException {
+ char c = parseCharExp();
+ if (match('-')) return makeCharRange(c, parseCharExp());
+ else return makeChar(c);
+ }
+
+ final RegExp parseSimpleExp() throws IllegalArgumentException {
+ if (match('.')) return makeAnyChar();
+ else if (check(EMPTY) && match('#')) return makeEmpty();
+ else if (check(ANYSTRING) && match('@')) return makeAnyString();
+ else if (match('"')) {
+ int start = pos;
+ while (more() && !peek("\""))
+ next();
+ if (!match('"')) throw new IllegalArgumentException(
+ "expected '\"' at position " + pos);
+ return makeString(b.substring(start, pos - 1));
+ } else if (match('(')) {
+ if (match(')')) return makeString("");
+ RegExp e = parseUnionExp();
+ if (!match(')')) throw new IllegalArgumentException(
+ "expected ')' at position " + pos);
+ return e;
+ } else if ((check(AUTOMATON) || check(INTERVAL)) && match('<')) {
+ int start = pos;
+ while (more() && !peek(">"))
+ next();
+ if (!match('>')) throw new IllegalArgumentException(
+ "expected '>' at position " + pos);
+ String s = b.substring(start, pos - 1);
+ int i = s.indexOf('-');
+ if (i == -1) {
+ if (!check(AUTOMATON)) throw new IllegalArgumentException(
+ "interval syntax error at position " + (pos - 1));
+ return makeAutomaton(s);
+ } else {
+ if (!check(INTERVAL)) throw new IllegalArgumentException(
+ "illegal identifier at position " + (pos - 1));
+ try {
+ if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-')) throw new NumberFormatException();
+ String smin = s.substring(0, i);
+ String smax = s.substring(i + 1, s.length());
+ int imin = Integer.parseInt(smin);
+ int imax = Integer.parseInt(smax);
+ int digits;
+ if (smin.length() == smax.length()) digits = smin.length();
+ else digits = 0;
+ if (imin > imax) {
+ int t = imin;
+ imin = imax;
+ imax = t;
+ }
+ return makeInterval(imin, imax, digits);
+ } catch (NumberFormatException e) {
+ throw new IllegalArgumentException(
+ "interval syntax error at position " + (pos - 1));
+ }
+ }
+ } else return makeChar(parseCharExp());
+ }
+
+ final char parseCharExp() throws IllegalArgumentException {
+ match('\\');
+ return next();
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\RegExp.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
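A short sketch of the optional syntax flags described above (illustrative, not part of the patch; it assumes BasicAutomata.makeCharRange is public, as it is already called from toAutomaton above):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.BasicOperations;
    import org.apache.lucene.util.automaton.RegExp;

    public class RegExpFlagsExample {
      public static void main(String[] args) {
        // enable only the intersection (&) construct
        Automaton a = new RegExp("[a-c]*&.*b.*", RegExp.INTERSECTION).toAutomaton();
        System.out.println(BasicOperations.run(a, "abc")); // true: only a-c, contains b
        System.out.println(BasicOperations.run(a, "ac"));  // false: no b

        // named automata (<identifier>) are resolved through an automaton map
        Map<String, Automaton> named = new HashMap<String, Automaton>();
        named.put("digit", BasicAutomata.makeCharRange('0', '9'));
        Automaton b = new RegExp("<digit>+", RegExp.AUTOMATON).toAutomaton(named);
        System.out.println(BasicOperations.run(b, "2009")); // true
      }
    }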
Index: src/java/org/apache/lucene/util/automaton/RunAutomaton.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 0)
@@ -0,0 +1,238 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Set;
+
+/**
+ * Finite-state automaton with fast run operation.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class RunAutomaton implements Serializable {
+
+ static final long serialVersionUID = 20001;
+
+ int size;
+ boolean[] accept;
+ int initial;
+ int[] transitions; // delta(state,c) = transitions[state*points.length +
+ // getCharClass(c)]
+ char[] points; // char interval start points
+ int[] classmap; // map from char number to character class
+
+ /**
+ * Sets alphabet table for optimal run performance.
+ */
+ final void setAlphabet() {
+ classmap = new int[Character.MAX_VALUE - Character.MIN_VALUE + 1];
+ int i = 0;
+ for (int j = 0; j <= Character.MAX_VALUE - Character.MIN_VALUE; j++) {
+ if (i + 1 < points.length && j == points[i + 1]) i++;
+ classmap[j] = i;
+ }
+ }
+
+ /**
+ * Returns a string representation of this automaton.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ b.append("initial state: ").append(initial).append("\n");
+ for (int i = 0; i < size; i++) {
+ b.append("state " + i);
+ if (accept[i]) b.append(" [accept]:\n");
+ else b.append(" [reject]:\n");
+ for (int j = 0; j < points.length; j++) {
+ int k = transitions[i * points.length + j];
+ if (k != -1) {
+ char min = points[j];
+ char max;
+ if (j + 1 < points.length) max = (char) (points[j + 1] - 1);
+ else max = Character.MAX_VALUE;
+ b.append(" ");
+ Transition.appendCharString(min, b);
+ if (min != max) {
+ b.append("-");
+ Transition.appendCharString(max, b);
+ }
+ b.append(" -> ").append(k).append("\n");
+ }
+ }
+ }
+ return b.toString();
+ }
+
+ /**
+ * Returns number of states in automaton.
+ */
+ public int getSize() {
+ return size;
+ }
+
+ /**
+ * Returns acceptance status for given state.
+ */
+ public boolean isAccept(int state) {
+ return accept[state];
+ }
+
+ /**
+ * Returns initial state.
+ */
+ public int getInitialState() {
+ return initial;
+ }
+
+ /**
+ * Returns a copy of the array of character class interval start points.
+ */
+ public char[] getCharIntervals() {
+ return points.clone();
+ }
+
+ /**
+ * Gets character class of given char.
+ */
+ int getCharClass(char c) {
+ return SpecialOperations.findIndex(c, points);
+ }
+
+ @SuppressWarnings("unused")
+ private RunAutomaton() {}
+
+ /**
+ * Constructs a new RunAutomaton from a deterministic
+ * Automaton. Same as RunAutomaton(a, true).
+ *
+ * @param a an automaton
+ */
+ public RunAutomaton(Automaton a) {
+ this(a, true);
+ }
+
+ /**
+ * Constructs a new RunAutomaton from a deterministic
+ * Automaton. If the given automaton is not deterministic, it is
+ * determinized first.
+ *
+ * @param a an automaton
+ * @param tableize if true, a transition table is created which makes the
+ * run method faster in return of a higher memory usage
+ */
+ public RunAutomaton(Automaton a, boolean tableize) {
+ a.determinize();
+ points = a.getStartPoints();
+ Set<State> states = a.getStates();
+ Automaton.setStateNumbers(states);
+ initial = a.initial.number;
+ size = states.size();
+ accept = new boolean[size];
+ transitions = new int[size * points.length];
+ for (int n = 0; n < size * points.length; n++)
+ transitions[n] = -1;
+ for (State s : states) {
+ int n = s.number;
+ accept[n] = s.accept;
+ for (int c = 0; c < points.length; c++) {
+ State q = s.step(points[c]);
+ if (q != null) transitions[n * points.length + c] = q.number;
+ }
+ }
+ if (tableize) setAlphabet();
+ }
+
+ /**
+ * Returns the state obtained by reading the given char from the given state.
+ * Returns -1 if not obtaining any such state. (If the original
+ * Automaton had no dead states, -1 is returned here if and only
+ * if a dead state is entered in an equivalent automaton with a total
+ * transition function.)
+ */
+ public int step(int state, char c) {
+ if (classmap == null) return transitions[state * points.length
+ + getCharClass(c)];
+ else return transitions[state * points.length
+ + classmap[c - Character.MIN_VALUE]];
+ }
+
+ /**
+ * Returns true if the given string is accepted by this automaton.
+ */
+ public boolean run(String s) {
+ int p = initial;
+ int l = s.length();
+ for (int i = 0; i < l; i++) {
+ p = step(p, s.charAt(i));
+ if (p == -1) return false;
+ }
+ return accept[p];
+ }
+
+ /**
+ * Returns true if the given string is accepted by this automaton
+ */
+ public boolean run(char[] s, int offset, int length) {
+ int p = initial;
+ int l = offset + length;
+ for (int i = offset; i < l; i++) {
+ p = step(p, s[i]);
+ if (p == -1) return false;
+ }
+ return accept[p];
+ }
+
+ /**
+ * Returns the length of the longest accepted run of the given string starting
+ * at the given offset.
+ *
+ * @param s the string
+ * @param offset offset into s where the run starts
+ * @return length of the longest accepted run, -1 if no run is accepted
+ */
+ public int run(String s, int offset) {
+ int p = initial;
+ int l = s.length();
+ int max = -1;
+ for (int r = 0; offset <= l; offset++, r++) {
+ if (accept[p]) max = r;
+ if (offset == l) break;
+ p = step(p, s.charAt(offset));
+ if (p == -1) break;
+ }
+ return max;
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\RunAutomaton.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
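
A minimal sketch of how the RunAutomaton added above is used (illustrative only, not one of the patch hunks; the class name RunAutomatonSketch is made up, and only the automaton classes introduced by this patch are assumed):

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.BasicOperations;
    import org.apache.lucene.util.automaton.RunAutomaton;

    public class RunAutomatonSketch {
      public static void main(String[] args) {
        // language: "doc" followed by anything, i.e. terms with the prefix "doc"
        Automaton a = BasicOperations.concatenate(
            BasicAutomata.makeString("doc"), BasicAutomata.makeAnyString());
        // tableize=true builds the classmap table, trading memory for a faster run()
        RunAutomaton r = new RunAutomaton(a, true);
        System.out.println(r.run("document"));      // true
        System.out.println(r.run("dictionary"));    // false
        // run(String, int) reports the length of the longest accepted run
        System.out.println(r.run("documents", 0));  // 9
      }
    }

The constructor determinizes its input first, so a nondeterministic automaton (for example the result of BasicOperations.union) is also acceptable here.
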
Index: src/java/org/apache/lucene/util/automaton/SpecialOperations.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 0)
@@ -0,0 +1,182 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Special automata operations.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+final public class SpecialOperations {
+
+ private SpecialOperations() {}
+
+ /**
+ * Finds the largest entry whose value is less than or equal to c, or 0 if
+ * there is no such entry.
+ */
+ static int findIndex(char c, char[] points) {
+ int a = 0;
+ int b = points.length;
+ while (b - a > 1) {
+ int d = (a + b) >>> 1;
+ if (points[d] > c) b = d;
+ else if (points[d] < c) a = d;
+ else return d;
+ }
+ return a;
+ }
+
+ /**
+ * Returns true if the language of this automaton is finite.
+ */
+ public static boolean isFinite(Automaton a) {
+ if (a.isSingleton()) return true;
+ return isFinite(a.initial, new HashSet<State>());
+ }
+
+ /**
+ * Checks whether there is a loop containing s. (This is sufficient since
+ * there are never transitions to dead states.)
+ */
+ private static boolean isFinite(State s, HashSet<State> path) {
+ path.add(s);
+ for (Transition t : s.transitions)
+ if (path.contains(t.to) || !isFinite(t.to, path)) return false;
+ path.remove(s);
+ return true;
+ }
+
+ /**
+ * Returns the longest string that is a prefix of all accepted strings and
+ * visits each state at most once.
+ *
+ * @return common prefix
+ */
+ public static String getCommonPrefix(Automaton a) {
+ if (a.isSingleton()) return a.singleton;
+ StringBuilder b = new StringBuilder();
+ HashSet<State> visited = new HashSet<State>();
+ State s = a.initial;
+ boolean done;
+ do {
+ done = true;
+ visited.add(s);
+ if (!s.accept && s.transitions.size() == 1) {
+ Transition t = s.transitions.iterator().next();
+ if (t.min == t.max && !visited.contains(t.to)) {
+ b.append(t.min);
+ s = t.to;
+ done = false;
+ }
+ }
+ } while (!done);
+ return b.toString();
+ }
+
+ /**
+ * Returns the longest string that is a suffix of all accepted strings and
+ * visits each state at most once.
+ *
+ * @return common suffix
+ */
+ public static String getCommonSuffix(Automaton a) {
+ if (a.isSingleton()) // if singleton, the suffix is the string itself.
+ return a.singleton;
+
+ // reverse the language of the automaton, then reverse its common prefix.
+ Automaton r = a.clone();
+ reverse(r);
+ r.determinize();
+ return reverseUnicode3(SpecialOperations.getCommonPrefix(r));
+ }
+
+ /**
+ * Reverses the language of the given (non-singleton) automaton while returning
+ * the set of new initial states.
+ */
+ private static Set<State> reverse(Automaton a) {
+ a.expandSingleton();
+ // reverse all edges
+ HashMap<State, Set<Transition>> m = new HashMap<State, Set<Transition>>();
+ Set<State> states = a.getStates();
+ Set<State> accept = a.getAcceptStates();
+ for (State r : states) {
+ m.put(r, new HashSet<Transition>());
+ r.accept = false;
+ }
+ for (State r : states)
+ for (Transition t : r.getTransitions())
+ m.get(t.to).add(new Transition(t.min, t.max, r));
+ for (State r : states)
+ r.transitions = m.get(r);
+ // make new initial+final states
+ a.initial.accept = true;
+ a.initial = new State();
+ for (State r : accept)
+ a.initial.addEpsilon(r); // ensures that all initial states are reachable
+ a.deterministic = false;
+ return accept;
+ }
+
+ /**
+ * Intentionally uses a Unicode 3 (code unit) reverse,
+ * because the result is only going to be reversed again...
+ */
+ private static String reverseUnicode3( final String input ){
+ char[] charInput = input.toCharArray();
+ reverseUnicode3(charInput, 0, charInput.length);
+ return new String(charInput);
+ }
+
+ /**
+ * Intentionally uses a Unicode 3 (code unit) reverse.
+ * This is because it is only used by getCommonSuffix(),
+ * which reverses the entire FSM using code unit reversal,
+ * so we must reverse its common prefix back using the
+ * same code unit reversal.
+ */
+ private static void reverseUnicode3(char[] buffer, int start, int len){
+ if (len <= 1) return;
+ int num = len>>1;
+ for (int i = start; i < ( start + num ); i++) {
+ char c = buffer[i];
+ buffer[i] = buffer[start * 2 + len - i - 1];
+ buffer[start * 2 + len - i - 1] = c;
+ }
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\SpecialOperations.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
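
To make the common prefix/suffix helpers above concrete, a small sketch (illustrative only, assuming the RegExp class added elsewhere in this patch; the class name is made up):

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.RegExp;
    import org.apache.lucene.util.automaton.SpecialOperations;

    public class CommonAffixSketch {
      public static void main(String[] args) {
        Automaton a = new RegExp("foo(bar|baz)").toAutomaton();
        System.out.println(SpecialOperations.getCommonPrefix(a)); // fooba
        System.out.println(SpecialOperations.getCommonSuffix(a)); // "" (last chars differ)
        System.out.println(SpecialOperations.isFinite(a));        // true

        Automaton b = new RegExp(".*ument").toAutomaton();
        System.out.println(SpecialOperations.getCommonPrefix(b)); // ""
        System.out.println(SpecialOperations.getCommonSuffix(b)); // ument
        System.out.println(SpecialOperations.isFinite(b));        // false
      }
    }
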
Index: src/java/org/apache/lucene/util/automaton/State.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/State.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/State.java (revision 0)
@@ -0,0 +1,202 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Automaton state.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class State implements Serializable, Comparable<State> {
+
+ static final long serialVersionUID = 30001;
+
+ boolean accept;
+ Set<Transition> transitions;
+
+ int number;
+
+ int id;
+ static int next_id;
+
+ /**
+ * Constructs a new state. Initially, the new state is a reject state.
+ */
+ public State() {
+ resetTransitions();
+ id = next_id++;
+ }
+
+ /**
+ * Resets transition set.
+ */
+ final void resetTransitions() {
+ transitions = new HashSet<Transition>();
+ }
+
+ /**
+ * Returns the set of outgoing transitions. Subsequent changes are reflected
+ * in the automaton.
+ *
+ * @return transition set
+ */
+ public Set<Transition> getTransitions() {
+ return transitions;
+ }
+
+ /**
+ * Adds an outgoing transition.
+ *
+ * @param t transition
+ */
+ public void addTransition(Transition t) {
+ transitions.add(t);
+ }
+
+ /**
+ * Sets acceptance for this state.
+ *
+ * @param accept if true, this state is an accept state
+ */
+ public void setAccept(boolean accept) {
+ this.accept = accept;
+ }
+
+ /**
+ * Returns acceptance status.
+ *
+ * @return true if this is an accept state
+ */
+ public boolean isAccept() {
+ return accept;
+ }
+
+ /**
+ * Performs lookup in transitions, assuming determinism.
+ *
+ * @param c character to look up
+ * @return destination state, null if no matching outgoing transition
+ * @see #step(char, Collection)
+ */
+ public State step(char c) {
+ for (Transition t : transitions)
+ if (t.min <= c && c <= t.max) return t.to;
+ return null;
+ }
+
+ /**
+ * Performs lookup in transitions, allowing nondeterminism.
+ *
+ * @param c character to look up
+ * @param dest collection where destination states are stored
+ * @see #step(char)
+ */
+ public void step(char c, Collection<State> dest) {
+ for (Transition t : transitions)
+ if (t.min <= c && c <= t.max) dest.add(t.to);
+ }
+
+ void addEpsilon(State to) {
+ if (to.accept) accept = true;
+ for (Transition t : to.transitions)
+ transitions.add(t);
+ }
+
+ /**
+ * Returns transitions sorted by (min, reverse max, to) or (to, min, reverse
+ * max)
+ */
+ Transition[] getSortedTransitionArray(boolean to_first) {
+ Transition[] e = transitions.toArray(new Transition[transitions.size()]);
+ Arrays.sort(e, new TransitionComparator(to_first));
+ return e;
+ }
+
+ /**
+ * Returns sorted list of outgoing transitions.
+ *
+ * @param to_first if true, order by (to, min, reverse max); otherwise (min,
+ * reverse max, to)
+ * @return transition list
+ */
+ public List<Transition> getSortedTransitions(boolean to_first) {
+ return Arrays.asList(getSortedTransitionArray(to_first));
+ }
+
+ /**
+ * Returns string describing this state. Normally invoked via
+ * {@link Automaton#toString()}.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ b.append("state ").append(number);
+ if (accept) b.append(" [accept]");
+ else b.append(" [reject]");
+ b.append(":\n");
+ for (Transition t : transitions)
+ b.append(" ").append(t.toString()).append("\n");
+ return b.toString();
+ }
+
+ /**
+ * Compares this object with the specified object for order. States are
+ * ordered by the time of construction.
+ */
+ public int compareTo(State s) {
+ return s.id - id;
+ }
+
+ /**
+ * See {@link java.lang.Object#equals(java.lang.Object)}.
+ */
+ @Override
+ public boolean equals(Object obj) {
+ return super.equals(obj);
+ }
+
+ /**
+ * See {@link java.lang.Object#hashCode()}.
+ */
+ @Override
+ public int hashCode() {
+ return super.hashCode();
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\State.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
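
A short sketch of navigating the State/Transition graph defined above (illustrative only; the class name is made up, everything else comes from classes in this patch):

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.State;
    import org.apache.lucene.util.automaton.Transition;

    public class StateWalkSketch {
      public static void main(String[] args) {
        Automaton a = BasicAutomata.makeCharRange('a', 'c');
        State initial = a.getInitialState();
        // sorted by (min, reverse max, to) when to_first is false
        for (Transition t : initial.getSortedTransitions(false)) {
          System.out.println(t.getMin() + "-" + t.getMax()
              + " -> accept=" + t.getDest().isAccept());  // a-c -> accept=true
        }
      }
    }
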
Index: src/java/org/apache/lucene/util/automaton/StatePair.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/StatePair.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/StatePair.java (revision 0)
@@ -0,0 +1,104 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+/**
+ * Pair of states.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class StatePair {
+ State s;
+ State s1;
+ State s2;
+
+ StatePair(State s, State s1, State s2) {
+ this.s = s;
+ this.s1 = s1;
+ this.s2 = s2;
+ }
+
+ /**
+ * Constructs a new state pair.
+ *
+ * @param s1 first state
+ * @param s2 second state
+ */
+ public StatePair(State s1, State s2) {
+ this.s1 = s1;
+ this.s2 = s2;
+ }
+
+ /**
+ * Returns first component of this pair.
+ *
+ * @return first state
+ */
+ public State getFirstState() {
+ return s1;
+ }
+
+ /**
+ * Returns second component of this pair.
+ *
+ * @return second state
+ */
+ public State getSecondState() {
+ return s2;
+ }
+
+ /**
+ * Checks for equality.
+ *
+ * @param obj object to compare with
+ * @return true if obj represents the same pair of states as this
+ * pair
+ */
+ @Override
+ public boolean equals(Object obj) {
+ if (obj instanceof StatePair) {
+ StatePair p = (StatePair) obj;
+ return p.s1 == s1 && p.s2 == s2;
+ } else return false;
+ }
+
+ /**
+ * Returns hash code.
+ *
+ * @return hash code
+ */
+ @Override
+ public int hashCode() {
+ return s1.hashCode() + s2.hashCode();
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\StatePair.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/util/automaton/Transition.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/Transition.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/Transition.java (revision 0)
@@ -0,0 +1,179 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+
+/**
+ * Automaton transition.
+ *
+ * A transition, which belongs to a source state, consists of a Unicode
+ * character interval and a destination state.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+public class Transition implements Serializable, Cloneable {
+
+ static final long serialVersionUID = 40001;
+
+ /*
+ * CLASS INVARIANT: min<=max
+ */
+
+ char min;
+ char max;
+
+ State to;
+
+ /**
+ * Constructs a new singleton interval transition.
+ *
+ * @param c transition character
+ * @param to destination state
+ */
+ public Transition(char c, State to) {
+ min = max = c;
+ this.to = to;
+ }
+
+ /**
+ * Constructs a new transition. Both end points are included in the interval.
+ *
+ * @param min transition interval minimum
+ * @param max transition interval maximum
+ * @param to destination state
+ */
+ public Transition(char min, char max, State to) {
+ if (max < min) {
+ char t = max;
+ max = min;
+ min = t;
+ }
+ this.min = min;
+ this.max = max;
+ this.to = to;
+ }
+
+ /** Returns minimum of this transition interval. */
+ public char getMin() {
+ return min;
+ }
+
+ /** Returns maximum of this transition interval. */
+ public char getMax() {
+ return max;
+ }
+
+ /** Returns destination of this transition. */
+ public State getDest() {
+ return to;
+ }
+
+ /**
+ * Checks for equality.
+ *
+ * @param obj object to compare with
+ * @return true if obj is a transition with same character interval
+ * and destination state as this transition.
+ */
+ @Override
+ public boolean equals(Object obj) {
+ if (obj instanceof Transition) {
+ Transition t = (Transition) obj;
+ return t.min == min && t.max == max && t.to == to;
+ } else return false;
+ }
+
+ /**
+ * Returns hash code. The hash code is based on the character interval (not
+ * the destination state).
+ *
+ * @return hash code
+ */
+ @Override
+ public int hashCode() {
+ return min * 2 + max * 3;
+ }
+
+ /**
+ * Clones this transition.
+ *
+ * @return clone with same character interval and destination state
+ */
+ @Override
+ public Transition clone() {
+ try {
+ return (Transition) super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ static void appendCharString(char c, StringBuilder b) {
+ if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.append(c);
+ else {
+ b.append("\\u");
+ String s = Integer.toHexString(c);
+ if (c < 0x10) b.append("000").append(s);
+ else if (c < 0x100) b.append("00").append(s);
+ else if (c < 0x1000) b.append("0").append(s);
+ else b.append(s);
+ }
+ }
+
+ /**
+ * Returns a string describing this state. Normally invoked via
+ * {@link Automaton#toString()}.
+ */
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ appendCharString(min, b);
+ if (min != max) {
+ b.append("-");
+ appendCharString(max, b);
+ }
+ b.append(" -> ").append(to.number);
+ return b.toString();
+ }
+
+ void appendDot(StringBuilder b) {
+ b.append(" -> ").append(to.number).append(" [label=\"");
+ appendCharString(min, b);
+ if (min != max) {
+ b.append("-");
+ appendCharString(max, b);
+ }
+ b.append("\"]\n");
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\Transition.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
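
One detail of the range constructor above that is easy to miss: a reversed interval is silently normalized rather than rejected. A tiny sketch (illustrative only):

    import org.apache.lucene.util.automaton.State;
    import org.apache.lucene.util.automaton.Transition;

    public class TransitionSketch {
      public static void main(String[] args) {
        State accept = new State();
        accept.setAccept(true);
        // endpoints passed out of order; the constructor swaps them
        Transition t = new Transition('z', 'a', accept);
        System.out.println(t.getMin() + "-" + t.getMax()); // a-z
      }
    }
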
Index: src/java/org/apache/lucene/util/automaton/TransitionComparator.java
===================================================================
--- src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0)
+++ src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 0)
@@ -0,0 +1,80 @@
+/*
+ * dk.brics.automaton
+ *
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+/**
+ * Comparator for state {@link Transition}s that orders unicode char range
+ * transitions in lexicographic order.
+ *
+ *
+ * WARNING: The status of the Automaton feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.
+ */
+class TransitionComparator implements Comparator<Transition>, Serializable {
+
+ static final long serialVersionUID = 10001;
+
+ boolean to_first;
+
+ TransitionComparator(boolean to_first) {
+ this.to_first = to_first;
+ }
+
+ /**
+ * Compares by (min, reverse max, to) or (to, min, reverse max).
+ */
+ public int compare(Transition t1, Transition t2) {
+ if (to_first) {
+ if (t1.to != t2.to) {
+ if (t1.to == null) return -1;
+ else if (t2.to == null) return 1;
+ else if (t1.to.number < t2.to.number) return -1;
+ else if (t1.to.number > t2.to.number) return 1;
+ }
+ }
+ if (t1.min < t2.min) return -1;
+ if (t1.min > t2.min) return 1;
+ if (t1.max > t2.max) return -1;
+ if (t1.max < t2.max) return 1;
+ if (!to_first) {
+ if (t1.to != t2.to) {
+ if (t1.to == null) return -1;
+ else if (t2.to == null) return 1;
+ else if (t1.to.number < t2.to.number) return -1;
+ else if (t1.to.number > t2.to.number) return 1;
+ }
+ }
+ return 0;
+ }
+}
Property changes on: src\java\org\apache\lucene\util\automaton\TransitionComparator.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/test/org/apache/lucene/search/TestAutomatonQuery.java
===================================================================
--- src/test/org/apache/lucene/search/TestAutomatonQuery.java (revision 0)
+++ src/test/org/apache/lucene/search/TestAutomatonQuery.java (revision 0)
@@ -0,0 +1,233 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
+
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+
+public class TestAutomatonQuery extends LuceneTestCase {
+ private IndexSearcher searcher;
+
+ private final String FN = "field";
+
+ public void setUp() throws Exception {
+ super.setUp();
+ RAMDirectory directory = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(directory,
+ new StandardAnalyzer(Version.LUCENE_CURRENT, Collections.emptySet()), true,
+ IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ Field titleField = new Field("title", "some title", Field.Store.NO,
+ Field.Index.ANALYZED);
+ Field field = new Field(FN, "this is document one 2345", Field.Store.NO,
+ Field.Index.ANALYZED);
+ Field footerField = new Field("footer", "a footer", Field.Store.NO,
+ Field.Index.ANALYZED);
+ doc.add(titleField);
+ doc.add(field);
+ doc.add(footerField);
+ writer.addDocument(doc);
+ field.setValue("some text from doc two, a short piece. 5678.91");
+ writer.addDocument(doc);
+ field.setValue("doc three has some different stuff: with numbers 1234 5678.9 and letter b");
+ writer.addDocument(doc);
+ writer.optimize();
+ writer.close();
+ searcher = new IndexSearcher(directory, true);
+ }
+
+ public void tearDown() throws Exception {
+ searcher.close();
+ super.tearDown();
+ }
+
+ private Term newTerm(String value) {
+ return new Term(FN, value);
+ }
+
+ private int automatonQueryNrHits(AutomatonQuery query) throws IOException {
+ return searcher.search(query, 5).totalHits;
+ }
+
+ private void assertAutomatonHits(int expected, Automaton automaton)
+ throws IOException {
+ AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton);
+
+ query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+ assertEquals(expected, automatonQueryNrHits(query));
+
+ query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
+ assertEquals(expected, automatonQueryNrHits(query));
+
+ query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
+ assertEquals(expected, automatonQueryNrHits(query));
+
+ query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
+ assertEquals(expected, automatonQueryNrHits(query));
+ }
+
+ /**
+ * Test some very simple automata.
+ */
+ public void testBasicAutomata() throws IOException {
+ assertAutomatonHits(0, BasicAutomata.makeEmpty());
+ assertAutomatonHits(0, BasicAutomata.makeEmptyString());
+ assertAutomatonHits(2, BasicAutomata.makeAnyChar());
+ assertAutomatonHits(3, BasicAutomata.makeAnyString());
+ assertAutomatonHits(2, BasicAutomata.makeString("doc"));
+ assertAutomatonHits(1, BasicAutomata.makeChar('a'));
+ assertAutomatonHits(2, BasicAutomata.makeCharRange('a', 'b'));
+ assertAutomatonHits(2, BasicAutomata.makeCharSet("ab"));
+ assertAutomatonHits(1, BasicAutomata.makeDecimalValue("5678.9"));
+ assertAutomatonHits(1, BasicAutomata.makeDecimalValue("2345"));
+ assertAutomatonHits(3, BasicAutomata.makeFractionDigits(3));
+ assertAutomatonHits(1, BasicAutomata.makeIntegerValue("1234"));
+ assertAutomatonHits(2, BasicAutomata.makeInterval(1233, 2346, 0));
+ assertAutomatonHits(1, BasicAutomata.makeInterval(0, 2000, 0));
+ assertAutomatonHits(2, BasicAutomata.makeMaxInteger("003000"));
+ assertAutomatonHits(1, BasicAutomata.makeMinInteger("002000"));
+ assertAutomatonHits(2, BasicAutomata.makeStringMatcher("ome"));
+ assertAutomatonHits(2, BasicAutomata.makeTotalDigits(5));
+ assertAutomatonHits(2, BasicOperations.union(BasicAutomata.makeChar('a'),
+ BasicAutomata.makeChar('b')));
+ assertAutomatonHits(0, BasicOperations.intersection(BasicAutomata
+ .makeChar('a'), BasicAutomata.makeChar('b')));
+ assertAutomatonHits(1, BasicOperations.minus(BasicAutomata
+ .makeMaxInteger("3000"), BasicAutomata.makeIntegerValue("1234")));
+ }
+
+ /**
+ * Test that a nondeterministic automaton works correctly. (It will be
+ * determinized.)
+ */
+ public void testNFA() throws IOException {
+ // accept this or three, the union is an NFA (two transitions for 't' from
+ // initial state)
+ Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"),
+ BasicAutomata.makeString("three"));
+ assertAutomatonHits(2, nfa);
+ }
+
+ public void testEquals() {
+ AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), BasicAutomata
+ .makeString("foobar"));
+ // reference to a1
+ AutomatonQuery a2 = a1;
+ // same as a1 (accepts the same language, same term)
+ AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"), BasicOperations
+ .concatenate(BasicAutomata.makeString("foo"), BasicAutomata
+ .makeString("bar")));
+ // different than a1 (same term, but different language)
+ AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), BasicAutomata
+ .makeString("different"));
+ // different than a1 (different term, same language)
+ AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), BasicAutomata
+ .makeString("foobar"));
+
+ assertEquals(a1, a2);
+ assertEquals(a1.hashCode(), a2.hashCode());
+
+ assertEquals(a1, a3);
+ assertEquals(a1.hashCode(), a3.hashCode());
+
+ assertEquals(a1.toString(), a3.toString());
+
+ // different class
+ AutomatonQuery w1 = new WildcardQuery(newTerm("foobar"));
+ // different class
+ AutomatonQuery w2 = new RegexpQuery(newTerm("foobar"));
+
+ assertFalse(a1.equals(w1));
+ assertFalse(a1.equals(w2));
+ assertFalse(w1.equals(w2));
+ assertFalse(a1.equals(a4));
+ assertFalse(a1.equals(a5));
+ assertFalse(a1.equals(null));
+ }
+
+ /**
+ * Test that rewriting to a single term works as expected and preserves
+ * MultiTermQuery semantics.
+ */
+ public void testRewriteSingleTerm() throws IOException {
+ AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"),
+ BasicAutomata.makeString("piece"));
+ assertTrue(aq.getEnum(searcher.getIndexReader()) instanceof SingleTermEnum);
+ assertTrue(aq.getTermsEnum(searcher.getIndexReader()) instanceof SingleTermsEnum);
+ assertEquals(1, automatonQueryNrHits(aq));
+ }
+
+ /**
+ * Test that rewriting to a prefix query works as expected and preserves
+ * MultiTermQuery semantics.
+ */
+ public void testRewritePrefix() throws IOException {
+ Automaton pfx = BasicAutomata.makeString("do");
+ pfx.expandSingleton(); // expand singleton representation for testing
+ Automaton prefixAutomaton = BasicOperations.concatenate(pfx, BasicAutomata
+ .makeAnyString());
+ AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"),
+ prefixAutomaton);
+ assertTrue(aq.getEnum(searcher.getIndexReader()) instanceof PrefixTermEnum);
+ assertTrue(aq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum);
+ assertEquals(3, automatonQueryNrHits(aq));
+ }
+
+ /**
+ * Test that a badly-performing automaton that must visit all the terms does
+ * not use the smart enumeration, as this would just waste CPU.
+ */
+ public void testLinearOptimization() throws IOException {
+ AutomatonQuery aq = new RegexpQuery(newTerm(".*ument"));
+ assertTrue(((AutomatonTermEnum) aq.getEnum(searcher.getIndexReader())).usesLinearMode());
+ assertTrue(((AutomatonTermsEnum) aq.getTermsEnum(searcher.getIndexReader())).usesLinearMode());
+ assertEquals(1, automatonQueryNrHits(aq));
+ }
+
+ /**
+ * Test that an automaton accepting no terms is optimized: it rewrites to an
+ * empty enumeration and the query returns no hits.
+ */
+ public void testEmptyOptimization() throws IOException {
+ AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"),
+ BasicAutomata.makeEmpty());
+ //not yet available: assertTrue(aq.getEnum(searcher.getIndexReader()) instanceof EmptyTermEnum);
+ assertTrue(aq.getTermsEnum(searcher.getIndexReader()) instanceof EmptyTermsEnum);
+ assertEquals(0, automatonQueryNrHits(aq));
+ }
+}
Property changes on: src\test\org\apache\lucene\search\TestAutomatonQuery.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
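
Outside the test harness, typical use of the new query looks roughly like this (a sketch; the field name "field", the surrounding searcher, and the class name are assumptions, not taken from the patch):

    import java.io.IOException;

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.AutomatonQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.BasicOperations;

    public class AutomatonQuerySketch {
      // assumes a searcher over an index with an analyzed "field"
      static TopDocs searchDocOrDocument(IndexSearcher searcher) throws IOException {
        Automaton a = BasicOperations.union(
            BasicAutomata.makeString("doc"), BasicAutomata.makeString("document"));
        // the Term mainly supplies the field; the automaton defines which terms match
        AutomatonQuery q = new AutomatonQuery(new Term("field", ""), a);
        return searcher.search(q, 10);
      }
    }
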
Index: src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java
===================================================================
--- src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (revision 0)
+++ src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (revision 0)
@@ -0,0 +1,177 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Test the automaton query for several unicode corner cases,
+ * specifically enumerating strings/indexes containing supplementary characters,
+ * and the differences between UTF-8/UTF-32 and UTF-16 binary sort order.
+ */
+public class TestAutomatonQueryUnicode extends LuceneTestCase {
+ private IndexSearcher searcher;
+
+ private final String FN = "field";
+
+ public void setUp() throws Exception {
+ super.setUp();
+ RAMDirectory directory = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(directory, new KeywordAnalyzer(), true,
+ IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ Field titleField = new Field("title", "some title", Field.Store.NO,
+ Field.Index.ANALYZED);
+ Field field = new Field(FN, "", Field.Store.NO,
+ Field.Index.ANALYZED);
+ Field footerField = new Field("footer", "a footer", Field.Store.NO,
+ Field.Index.ANALYZED);
+ doc.add(titleField);
+ doc.add(field);
+ doc.add(footerField);
+ field.setValue("\uD866\uDF05abcdef");
+ writer.addDocument(doc);
+ field.setValue("\uD866\uDF06ghijkl");
+ writer.addDocument(doc);
+ field.setValue("\uFB94mnopqr"); // this sorts before the previous two in UTF-8/UTF-32, but after in UTF-16!!!
+ writer.addDocument(doc);
+ field.setValue("\uFB95stuvwx"); // this one too.
+ writer.addDocument(doc);
+ field.setValue("a\uFFFCbc");
+ writer.addDocument(doc);
+ field.setValue("a\uFFFDbc");
+ writer.addDocument(doc);
+ field.setValue("a\uFFFEbc");
+ writer.addDocument(doc);
+ field.setValue("a\uFB94bc");
+ writer.addDocument(doc);
+ field.setValue("bacadaba");
+ writer.addDocument(doc);
+ field.setValue("\uFFFD");
+ writer.addDocument(doc);
+ field.setValue("\uFFFD\uD866\uDF05");
+ writer.addDocument(doc);
+ field.setValue("\uFFFD\uFFFD");
+ writer.addDocument(doc);
+ writer.optimize();
+ writer.close();
+ searcher = new IndexSearcher(directory, true);
+ }
+
+ public void tearDown() throws Exception {
+ searcher.close();
+ super.tearDown();
+ }
+
+ private Term newTerm(String value) {
+ return new Term(FN, value);
+ }
+
+ private int automatonQueryNrHits(AutomatonQuery query) throws IOException {
+ return searcher.search(query, 5).totalHits;
+ }
+
+ private void assertAutomatonHits(int expected, Automaton automaton)
+ throws IOException {
+ AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton);
+
+ query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+ assertEquals(expected, automatonQueryNrHits(query));
+
+ query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
+ assertEquals(expected, automatonQueryNrHits(query));
+
+ query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
+ assertEquals(expected, automatonQueryNrHits(query));
+
+ query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
+ assertEquals(expected, automatonQueryNrHits(query));
+ }
+
+ /**
+ * Test that AutomatonQuery interacts with Lucene's sort order correctly.
+ *
+ * This expression matches something either starting with a character from the Arabic Presentation Forms block,
+ * or a supplementary character.
+ */
+ public void testSortOrder() throws IOException {
+ Automaton a = new RegExp("((\uD866\uDF05)|\uFB94).*").toAutomaton();
+ assertAutomatonHits(2, a);
+ }
+
+ /**
+ * Test that AutomatonQuery properly seeks to supplementary characters.
+ * Transitions are modeled as UTF-16 code units, so without special handling
+ * it will, by default, try to seek to a lead surrogate for some DFAs.
+ */
+ public void testSeekSurrogate() throws IOException {
+ Automaton a = new RegExp("\uD866[a\uDF05\uFB93][a-z]{0,5}[fl]").toAutomaton();
+ assertAutomatonHits(1, a);
+ }
+
+ /**
+ * Try seeking to an ending lead surrogate.
+ */
+ public void testSeekSurrogate2() throws IOException {
+ Automaton a = new RegExp("\uD866(\uDF06ghijkl)?").toAutomaton();
+ assertAutomatonHits(1, a);
+ }
+
+ /**
+ * Try seeking to a starting trail surrogate.
+ */
+ public void testSeekSurrogate3() throws IOException {
+ Automaton a = new RegExp("[\uDF06\uFB94]mnopqr").toAutomaton();
+ assertAutomatonHits(1, a);
+ }
+
+ /**
+ * Try seeking to a medial/final trail surrogate.
+ */
+ public void testSeekSurrogate4() throws IOException {
+ Automaton a = new RegExp("a[\uDF06\uFB94]bc").toAutomaton();
+ assertAutomatonHits(1, a);
+ }
+
+ /**
+ * Ensure the 'constant suffix' does not contain a leading trail surrogate.
+ */
+ public void testSurrogateSuffix() throws IOException {
+ Automaton a = new RegExp(".*[\uD865\uD866]\uDF06ghijkl").toAutomaton();
+ assertAutomatonHits(1, a);
+ }
+
+ /**
+ * Try a case where the constant suffix would be only a leading trail surrogate;
+ * in that case an empty suffix must be used instead.
+ */
+ public void testSurrogateSuffix2() throws IOException {
+ Automaton a = new RegExp(".*\uDF05").toAutomaton();
+ assertAutomatonHits(1, a);
+ }
+}
Property changes on: src\test\org\apache\lucene\search\TestAutomatonQueryUnicode.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
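
The ordering subtlety these tests exercise can be reproduced with plain Java strings; a small sketch (illustrative only) using the same sample values indexed above:

    public class SurrogateOrderSketch {
      public static void main(String[] args) {
        String supplementary = "\uD866\uDF05abcdef"; // surrogate pair, code point above U+FFFF
        String presentation  = "\uFB94mnopqr";       // single BMP character U+FB94
        // UTF-16 code unit order (String.compareTo): the lead surrogate 0xD866 sorts first
        System.out.println(supplementary.compareTo(presentation) < 0);                  // true
        // code point (UTF-8/UTF-32) order: U+FB94 is the smaller value, so the order flips
        System.out.println(supplementary.codePointAt(0) > presentation.codePointAt(0)); // true
      }
    }
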
Index: src/test/org/apache/lucene/search/TestNumericRangeQuery32.java
===================================================================
--- src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 887534)
+++ src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (working copy)
@@ -444,18 +444,16 @@
lower, upper, true, true);
FilteredTermsEnum termEnum = q.getTermsEnum(searcher.getIndexReader());
int count = 0;
- if (!termEnum.empty()) {
- do {
- final TermRef t = termEnum.term();
- if (t != null) {
- final int val = NumericUtils.prefixCodedToInt(t.toString());
- assertTrue("value not in bounds " + val + " >= " + lower + " && "
- + val + " <= " + upper, val >= lower && val <= upper);
- count++;
- } else
- break;
- } while (termEnum.next() != null);
- }
+ while (termEnum.next() != null) {
+ final TermRef t = termEnum.term();
+ if (t != null) {
+ final int val = NumericUtils.prefixCodedToInt(t.toString());
+ assertTrue("value not in bounds " + val + " >= " + lower + " && "
+ + val + " <= " + upper, val >= lower && val <= upper);
+ count++;
+ } else
+ break;
+ }
assertNull(termEnum.next());
System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper
+ "] contained " + count + " terms.");
Index: src/test/org/apache/lucene/search/TestRegexpQuery.java
===================================================================
--- src/test/org/apache/lucene/search/TestRegexpQuery.java (revision 0)
+++ src/test/org/apache/lucene/search/TestRegexpQuery.java (revision 0)
@@ -0,0 +1,124 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonProvider;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Some simple regex tests, mostly converted from contrib's TestRegexQuery.
+ */
+public class TestRegexpQuery extends LuceneTestCase {
+ private IndexSearcher searcher;
+ private final String FN = "field";
+
+ public void setUp() throws Exception {
+ super.setUp();
+ RAMDirectory directory = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
+ true, IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ doc.add(new Field(FN,
+ "the quick brown fox jumps over the lazy ??? dog 493432 49344",
+ Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ writer.optimize();
+ writer.close();
+ searcher = new IndexSearcher(directory, true);
+ }
+
+ public void tearDown() throws Exception {
+ searcher.close();
+ super.tearDown();
+ }
+
+ private Term newTerm(String value) { return new Term(FN, value); }
+
+ private int regexQueryNrHits(String regex) throws IOException {
+ RegexpQuery query = new RegexpQuery( newTerm(regex));
+ return searcher.search(query, 5).totalHits;
+ }
+
+ public void testRegex1() throws IOException {
+ assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
+ }
+
+ public void testRegex2() throws IOException {
+ assertEquals(0, regexQueryNrHits(".[aeiou]c.*"));
+ }
+
+ public void testRegex3() throws IOException {
+ assertEquals(0, regexQueryNrHits("q.[aeiou]c"));
+ }
+
+ public void testNumericRange() throws IOException {
+ assertEquals(1, regexQueryNrHits("<420000-600000>"));
+ assertEquals(0, regexQueryNrHits("<493433-600000>"));
+ }
+
+ public void testRegexComplement() throws IOException {
+ assertEquals(1, regexQueryNrHits("4934~[3]"));
+ // not the empty lang, i.e. match all docs
+ assertEquals(1, regexQueryNrHits("~#"));
+ }
+
+ public void testCustomProvider() throws IOException {
+ AutomatonProvider myProvider = new AutomatonProvider() {
+ // automaton that matches quick or brown
+ private Automaton quickBrownAutomaton = BasicOperations.union(
+ Arrays.asList(new Automaton[] {
+ BasicAutomata.makeString("quick"),
+ BasicAutomata.makeString("brown"),
+ BasicAutomata.makeString("bob")}));
+
+ public Automaton getAutomaton(String name) throws IOException {
+ if (name.equals("quickBrown"))
+ return quickBrownAutomaton;
+ else
+ return null;
+ }
+ };
+ RegexpQuery query = new RegexpQuery(newTerm(""), RegExp.ALL, myProvider);
+ assertEquals(1, searcher.search(query, 5).totalHits);
+ }
+
+ /**
+ * Test a corner case for backtracking:
+ * In this case the term dictionary has 493432 followed by 49344.
+ * When backtracking from 49343... to 4934, it's necessary
+ * to test that 4934 itself is ok before trying to append more characters.
+ */
+ public void testBacktracking() throws IOException {
+ assertEquals(1, regexQueryNrHits("4934[314]"));
+ }
+}
+
Property changes on: src\test\org\apache\lucene\search\TestRegexpQuery.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/test/org/apache/lucene/search/TestRegexpRandom.java
===================================================================
--- src/test/org/apache/lucene/search/TestRegexpRandom.java (revision 0)
+++ src/test/org/apache/lucene/search/TestRegexpRandom.java (revision 0)
@@ -0,0 +1,144 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Random;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Create an index with terms from 0000-9999.
+ * Generates random regexps according to simple patterns,
+ * and validates the correct number of hits are returned.
+ */
+public class TestRegexpRandom extends LuceneTestCase {
+ private Searcher searcher;
+ private Random random;
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ RAMDirectory dir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+
+ Document doc = new Document();
+ Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
+ doc.add(field);
+
+ NumberFormat df = new DecimalFormat("0000");
+ for (int i = 0; i < 10000; i++) {
+ field.setValue(df.format(i));
+ writer.addDocument(doc);
+ }
+
+ writer.optimize();
+ writer.close();
+ searcher = new IndexSearcher(dir);
+ }
+
+ private char N() {
+ return (char) (0x30 + random.nextInt(10));
+ }
+
+ private String fillPattern(String wildcardPattern) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < wildcardPattern.length(); i++) {
+ switch(wildcardPattern.charAt(i)) {
+ case 'N':
+ sb.append(N());
+ break;
+ default:
+ sb.append(wildcardPattern.charAt(i));
+ }
+ }
+ return sb.toString();
+ }
+
+ private void assertPatternHits(String pattern, int numHits) throws Exception {
+ Query wq = new RegexpQuery(new Term("field", fillPattern(pattern)));
+ TopDocs docs = searcher.search(wq, 25);
+ assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits);
+ }
+
+ @Override
+ protected void tearDown() throws Exception {
+ searcher.close();
+ super.tearDown();
+ }
+
+ public void testRegexps() throws Exception {
+ random = newRandom(System.nanoTime());
+ for (int i = 0; i < 100; i++) {
+ assertPatternHits("NNNN", 1);
+ assertPatternHits(".NNN", 10);
+ assertPatternHits("N.NN", 10);
+ assertPatternHits("NN.N", 10);
+ assertPatternHits("NNN.", 10);
+ }
+
+ for (int i = 0; i < 10; i++) {
+ assertPatternHits(".{1,2}NN", 100);
+ assertPatternHits("N.{1,2}N", 100);
+ assertPatternHits("NN.{1,2}", 100);
+ assertPatternHits(".{1,3}N", 1000);
+ assertPatternHits("N.{1,3}", 1000);
+ assertPatternHits(".{1,4}", 10000);
+
+ assertPatternHits("NNN[3-7]", 5);
+ assertPatternHits("NN[2-6][3-7]", 25);
+ assertPatternHits("N[1-5][2-6][3-7]", 125);
+ assertPatternHits("[0-4][3-7][4-8][5-9]", 625);
+ assertPatternHits("[3-7][2-6][0-4]N", 125);
+ assertPatternHits("[2-6][3-7]NN", 25);
+ assertPatternHits("[3-7]NNN", 5);
+
+ assertPatternHits("NNN.*", 10);
+ assertPatternHits("NN.*", 100);
+ assertPatternHits("N.*", 1000);
+ assertPatternHits(".*", 10000);
+
+ assertPatternHits(".*NNN", 10);
+ assertPatternHits(".*NN", 100);
+ assertPatternHits(".*N", 1000);
+
+ assertPatternHits("N.*NN", 10);
+ assertPatternHits("NN.*N", 10);
+
+ // combo of ? and * operators
+ assertPatternHits(".NN.*", 100);
+ assertPatternHits("N.N.*", 100);
+ assertPatternHits("NN..*", 100);
+ assertPatternHits(".N..*", 1000);
+ assertPatternHits("N...*", 1000);
+
+ assertPatternHits(".*NN.", 100);
+ assertPatternHits(".*N..", 1000);
+ assertPatternHits(".*...", 10000);
+ assertPatternHits(".*.N.", 1000);
+ assertPatternHits(".*..N", 1000);
+ }
+ }
+}
Property changes on: src\test\org\apache\lucene\search\TestRegexpRandom.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/test/org/apache/lucene/search/TestWildcard.java
===================================================================
--- src/test/org/apache/lucene/search/TestWildcard.java (revision 887534)
+++ src/test/org/apache/lucene/search/TestWildcard.java (working copy)
@@ -24,6 +24,7 @@
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
@@ -119,31 +120,6 @@
MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
assertMatches(searcher, wq, 2);
-
- MultiTermQuery expected = new PrefixQuery(new Term("field", "prefix"));
- wq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
- wq.setBoost(0.1F);
- expected.setRewriteMethod(wq.getRewriteMethod());
- expected.setBoost(wq.getBoost());
- assertEquals(searcher.rewrite(expected), searcher.rewrite(wq));
-
- wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
- wq.setBoost(0.2F);
- expected.setRewriteMethod(wq.getRewriteMethod());
- expected.setBoost(wq.getBoost());
- assertEquals(searcher.rewrite(expected), searcher.rewrite(wq));
-
- wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
- wq.setBoost(0.3F);
- expected.setRewriteMethod(wq.getRewriteMethod());
- expected.setBoost(wq.getBoost());
- assertEquals(searcher.rewrite(expected), searcher.rewrite(wq));
-
- wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
- wq.setBoost(0.4F);
- expected.setRewriteMethod(wq.getRewriteMethod());
- expected.setBoost(wq.getBoost());
- assertEquals(searcher.rewrite(expected), searcher.rewrite(wq));
}
/**
@@ -326,4 +302,57 @@
searcher.close();
}
+ @Deprecated
+ private static final class OldWildcardQuery extends MultiTermQuery {
+ final Term term;
+
+ OldWildcardQuery(Term term) {
+ this.term = term;
+ }
+
+ @Override
+ protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
+ return new WildcardTermEnum(reader, term);
+ }
+
+ @Override
+ public String toString(String field) {
+ return "OldWildcard(" + term.toString()+ ")";
+ }
+ }
+
+ @Deprecated
+ public void testDeprecatedTermEnum() throws Exception {
+ RAMDirectory indexStore = getIndexStore("body", new String[]
+ {"metal", "metals"});
+ IndexSearcher searcher = new IndexSearcher(indexStore, true);
+ Query query1 = new TermQuery(new Term("body", "metal"));
+ Query query2 = new OldWildcardQuery(new Term("body", "metal*"));
+ Query query3 = new OldWildcardQuery(new Term("body", "m*tal"));
+ Query query4 = new OldWildcardQuery(new Term("body", "m*tal*"));
+ Query query5 = new OldWildcardQuery(new Term("body", "m*tals"));
+
+ BooleanQuery query6 = new BooleanQuery();
+ query6.add(query5, BooleanClause.Occur.SHOULD);
+
+ BooleanQuery query7 = new BooleanQuery();
+ query7.add(query3, BooleanClause.Occur.SHOULD);
+ query7.add(query5, BooleanClause.Occur.SHOULD);
+
+ // Queries do not automatically lower-case search terms:
+ Query query8 = new OldWildcardQuery(new Term("body", "M*tal*"));
+
+ assertMatches(searcher, query1, 1);
+ assertMatches(searcher, query2, 2);
+ assertMatches(searcher, query3, 1);
+ assertMatches(searcher, query4, 2);
+ assertMatches(searcher, query5, 1);
+ assertMatches(searcher, query6, 1);
+ assertMatches(searcher, query7, 2);
+ assertMatches(searcher, query8, 0);
+ assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tall")), 0);
+ assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal")), 1);
+ assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal*")), 2);
+ }
+
}
Index: src/test/org/apache/lucene/search/TestWildcardRandom.java
===================================================================
--- src/test/org/apache/lucene/search/TestWildcardRandom.java (revision 0)
+++ src/test/org/apache/lucene/search/TestWildcardRandom.java (revision 0)
@@ -0,0 +1,136 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Random;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Creates an index with terms from 0000-9999,
+ * generates random wildcard patterns against it,
+ * and validates that the correct number of hits is returned.
+ */
+public class TestWildcardRandom extends LuceneTestCase {
+ private Searcher searcher;
+ private Random random;
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ RAMDirectory dir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+
+ Document doc = new Document();
+ Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
+ doc.add(field);
+
+ NumberFormat df = new DecimalFormat("0000");
+ for (int i = 0; i < 10000; i++) {
+ field.setValue(df.format(i));
+ writer.addDocument(doc);
+ }
+
+ writer.optimize();
+ writer.close();
+ searcher = new IndexSearcher(dir);
+ }
+
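+  /** a single random digit between '0' and '9' */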
+ private char N() {
+ return (char) (0x30 + random.nextInt(10));
+ }
+
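+  /** replaces each 'N' in the pattern with a random digit */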
+ private String fillPattern(String wildcardPattern) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < wildcardPattern.length(); i++) {
+ switch(wildcardPattern.charAt(i)) {
+ case 'N':
+ sb.append(N());
+ break;
+ default:
+ sb.append(wildcardPattern.charAt(i));
+ }
+ }
+ return sb.toString();
+ }
+
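+  /** fills the pattern with random digits, runs it as a WildcardQuery and checks the hit count */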
+ private void assertPatternHits(String pattern, int numHits) throws Exception {
+ Query wq = new WildcardQuery(new Term("field", fillPattern(pattern)));
+ TopDocs docs = searcher.search(wq, 25);
+ assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits);
+ }
+
+ @Override
+ protected void tearDown() throws Exception {
+ searcher.close();
+ super.tearDown();
+ }
+
+ public void testWildcards() throws Exception {
+ random = newRandom(System.nanoTime());
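+    // Terms are the 4-digit strings 0000-9999: each '?' matches exactly one digit
+    // and '*' matches zero or more, so e.g. "N?NN" should hit 10 documents and
+    // "NN*" should hit 100.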
+ for (int i = 0; i < 100; i++) {
+ assertPatternHits("NNNN", 1);
+ assertPatternHits("?NNN", 10);
+ assertPatternHits("N?NN", 10);
+ assertPatternHits("NN?N", 10);
+ assertPatternHits("NNN?", 10);
+ }
+
+ for (int i = 0; i < 10; i++) {
+ assertPatternHits("??NN", 100);
+ assertPatternHits("N??N", 100);
+ assertPatternHits("NN??", 100);
+ assertPatternHits("???N", 1000);
+ assertPatternHits("N???", 1000);
+ assertPatternHits("????", 10000);
+
+ assertPatternHits("NNN*", 10);
+ assertPatternHits("NN*", 100);
+ assertPatternHits("N*", 1000);
+ assertPatternHits("*", 10000);
+
+ assertPatternHits("*NNN", 10);
+ assertPatternHits("*NN", 100);
+ assertPatternHits("*N", 1000);
+
+ assertPatternHits("N*NN", 10);
+ assertPatternHits("NN*N", 10);
+
+ // combo of ? and * operators
+ assertPatternHits("?NN*", 100);
+ assertPatternHits("N?N*", 100);
+ assertPatternHits("NN?*", 100);
+ assertPatternHits("?N?*", 1000);
+ assertPatternHits("N??*", 1000);
+
+ assertPatternHits("*NN?", 100);
+ assertPatternHits("*N??", 1000);
+ assertPatternHits("*???", 10000);
+ assertPatternHits("*?N?", 1000);
+ assertPatternHits("*??N", 1000);
+ }
+ }
+}
Property changes on: src\test\org\apache\lucene\search\TestWildcardRandom.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native