Index: contrib/misc/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java =================================================================== --- contrib/misc/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java (revision 820510) +++ contrib/misc/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java (working copy) @@ -67,8 +67,8 @@ boolean inPhrase = false; int savedStart = 0, savedEnd = 0; - TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + TermAttribute termAtt = addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); public boolean incrementToken() throws IOException { if (inPhrase) { Index: contrib/misc/src/java/org/apache/lucene/queryParser/precedence/FastCharStream.java =================================================================== --- contrib/misc/src/java/org/apache/lucene/queryParser/precedence/FastCharStream.java (revision 820510) +++ contrib/misc/src/java/org/apache/lucene/queryParser/precedence/FastCharStream.java (working copy) @@ -18,8 +18,6 @@ * limitations under the License. */ -import org.apache.lucene.queryParser.*; - import java.io.*; /** An efficient implementation of JavaCC's CharStream interface.
Note that Index: contrib/misc/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java =================================================================== --- contrib/misc/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java (revision 820510) +++ contrib/misc/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java (working copy) @@ -6,12 +6,16 @@ import java.text.DateFormat; import java.util.ArrayList; import java.util.Date; +import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.Vector; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; @@ -27,64 +31,68 @@ import org.apache.lucene.util.Parameter; /** - * Experimental query parser variant designed to handle operator precedence - * in a more sensible fashion than QueryParser. There are still some - * open issues with this parser. The following tests are currently failing - * in TestPrecedenceQueryParser and are disabled to make this test pass: + * Experimental query parser variant designed to handle operator precedence in a + * more sensible fashion than QueryParser. There are still some open issues with + * this parser. The following tests are currently failing in + * TestPrecedenceQueryParser and are disabled to make this test pass: *
+) or a minus (-) sign, indicating
- * that the clause is required or prohibited respectively; or
- * +) or a minus (-) sign, indicating that
+ * the clause is required or prohibited respectively; or
+ * +/- prefix to require any of a set of
- * terms.
+ * +/- prefix to require any of a set of terms.
*
* Query ::= ( Clause )*
- * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
+ * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
*
- *
+ *
* * Examples of appropriately formatted queries can be found in the query syntax * documentation. *
- * + * */ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants { - private static final int CONJ_NONE = 0; - private static final int CONJ_AND = 1; - private static final int CONJ_OR = 2; + private static final int CONJ_NONE = 0; - private static final int MOD_NONE = 0; - private static final int MOD_NOT = 10; - private static final int MOD_REQ = 11; + private static final int CONJ_AND = 1; + private static final int CONJ_OR = 2; + + private static final int MOD_NONE = 0; + + private static final int MOD_NOT = 10; + + private static final int MOD_REQ = 11; + // make it possible to call setDefaultOperator() without accessing // the nested class: public static final Operator AND_OPERATOR = Operator.AND; + public static final Operator OR_OPERATOR = Operator.OR; /** The actual operator that parser uses to combine query terms */ @@ -93,23 +101,32 @@ boolean lowercaseExpandedTerms = true; Analyzer analyzer; + String field; + int phraseSlop = 0; + float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity; + int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; + Locale locale = Locale.getDefault(); static final class Operator extends Parameter { private Operator(String name) { super(name); } + static final Operator OR = new Operator("OR"); + static final Operator AND = new Operator("AND"); } - /** Constructs a query parser. - * @param f the default field for query terms. - * @param a used to find terms in the query text. + /** + * Constructs a query parser. + * + * @param f the default field for query terms. + * @param a used to find terms in the query text. */ public PrecedenceQueryParser(String f, Analyzer a) { this(new FastCharStream(new StringReader(""))); @@ -117,9 +134,11 @@ field = f; } - /** Parses a query string, returning a {@link org.apache.lucene.search.Query}. - * @param expression the query string to be parsed. 
- * @throws ParseException if the parsing fails + /** + * Parses a query string, returning a {@link org.apache.lucene.search.Query}. + * + * @param expression the query string to be parsed. + * @throws ParseException if the parsing fails */ public Query parse(String expression) throws ParseException { // optimize empty query to be empty BooleanQuery @@ -131,16 +150,14 @@ try { Query query = Query(field); return (query != null) ? query : new BooleanQuery(); - } - catch (TokenMgrError tme) { + } catch (TokenMgrError tme) { throw new ParseException(tme.getMessage()); - } - catch (BooleanQuery.TooManyClauses tmc) { + } catch (BooleanQuery.TooManyClauses tmc) { throw new ParseException("Too many boolean clauses"); } } - /** + /** * @return Returns the analyzer. */ public Analyzer getAnalyzer() { @@ -154,23 +171,23 @@ return field; } - /** + /** * Get the minimal similarity for fuzzy queries. */ public float getFuzzyMinSim() { - return fuzzyMinSim; + return fuzzyMinSim; } /** - * Set the minimum similarity for fuzzy queries. - * Default is 0.5f. + * Set the minimum similarity for fuzzy queries. Default is 0.5f. */ public void setFuzzyMinSim(float fuzzyMinSim) { - this.fuzzyMinSim = fuzzyMinSim; + this.fuzzyMinSim = fuzzyMinSim; } - /** - * Get the prefix length for fuzzy queries. + /** + * Get the prefix length for fuzzy queries. + * * @return Returns the fuzzyPrefixLength. */ public int getFuzzyPrefixLength() { @@ -179,6 +196,7 @@ /** * Set the prefix length for fuzzy queries. Default is 0. + * * @param fuzzyPrefixLength The fuzzyPrefixLength to set. */ public void setFuzzyPrefixLength(int fuzzyPrefixLength) { @@ -186,8 +204,8 @@ } /** - * Sets the default slop for phrases. If zero, then exact phrase matches - * are required. Default value is zero. + * Sets the default slop for phrases. If zero, then exact phrase matches are + * required. Default value is zero. 
*/ public void setPhraseSlop(int phraseSlop) { this.phraseSlop = phraseSlop; @@ -201,28 +219,29 @@ } /** - * Sets the boolean operator of the QueryParser. - * In default mode (OR_OPERATOR) terms without any modifiers
- * are considered optional: for example capital of Hungary is equal to
+ * Sets the boolean operator of the QueryParser. In default mode (
+ * OR_OPERATOR) terms without any modifiers are considered
+ * optional: for example capital of Hungary is equal to
* capital OR of OR Hungary.AND_OPERATOR mode terms are considered to be in conjuction: the
- * above mentioned query is parsed as capital AND of AND Hungary
+ * In AND_OPERATOR mode terms are considered to be in conjunction:
+ * the above mentioned query is parsed as
+ * capital AND of AND Hungary
*/
public void setDefaultOperator(Operator op) {
this.operator = op;
}
/**
- * Gets implicit operator setting, which will be either AND_OPERATOR
- * or OR_OPERATOR.
+ * Gets implicit operator setting, which will be either AND_OPERATOR or
+ * OR_OPERATOR.
*/
public Operator getDefaultOperator() {
return operator;
}
/**
- * Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically
- * lower-cased or not. Default is true.
+ * Whether terms of wildcard, prefix, fuzzy and range queries are to be
+ * automatically lower-cased or not. Default is true.
*/
public void setLowercaseExpandedTerms(boolean lowercaseExpandedTerms) {
this.lowercaseExpandedTerms = lowercaseExpandedTerms;
@@ -262,17 +281,19 @@
// If this term is introduced by AND, make the preceding term required,
// unless it's already prohibited
if (clauses.size() > 0 && conj == CONJ_AND) {
- BooleanClause c = (BooleanClause) clauses.get(clauses.size()-1);
+ BooleanClause c = (BooleanClause) clauses.get(clauses.size() - 1);
if (!c.isProhibited())
c.setOccur(BooleanClause.Occur.MUST);
}
if (clauses.size() > 0 && operator == AND_OPERATOR && conj == CONJ_OR) {
// If this term is introduced by OR, make the preceding term optional,
- // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
- // notice if the input is a OR b, first term is parsed as required; without
+ // unless it's prohibited (that means we leave -a OR b, but turn
+ // +a OR b into a OR b).
+ // Notice that if the input is a OR b, the first term is parsed as
+ // required; without
// this modification a OR b would parsed as +a OR b
- BooleanClause c = (BooleanClause) clauses.get(clauses.size()-1);
+ BooleanClause c = (BooleanClause) clauses.get(clauses.size() - 1);
if (!c.isProhibited())
c.setOccur(BooleanClause.Occur.SHOULD);
}
@@ -294,7 +315,7 @@
// We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED
// if not PROHIBITED and not introduced by OR
prohibited = (modifier == MOD_NOT);
- required = (!prohibited && conj != CONJ_OR);
+ required = (!prohibited && conj != CONJ_OR);
}
if (required && !prohibited)
clauses.add(new BooleanClause(q, BooleanClause.Occur.MUST));
@@ -303,100 +324,178 @@
else if (!required && prohibited)
clauses.add(new BooleanClause(q, BooleanClause.Occur.MUST_NOT));
else
- throw new RuntimeException("Clause cannot be both required and prohibited");
+ throw new RuntimeException(
+ "Clause cannot be both required and prohibited");
}
/**
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFieldQuery(String field, String queryText) throws ParseException {
+ protected Query getFieldQuery(String field, String queryText)
+ throws ParseException {
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
- TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
- List list = new ArrayList();
- final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
- org.apache.lucene.analysis.Token nextToken;
+ TokenStream source = analyzer.tokenStream(field,
+ new StringReader(queryText));
+ try {
+ source = analyzer.reusableTokenStream(field, new StringReader(queryText));
+ source.reset();
+ } catch (IOException e) {
+ source = analyzer.tokenStream(field, new StringReader(queryText));
+ }
+
+ CachingTokenFilter buffer = new CachingTokenFilter(source);
+ TermAttribute termAtt = null;
+ PositionIncrementAttribute posIncrAtt = null;
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
+ int numTokens = 0;
- while (true) {
- try {
- nextToken = source.next(reusableToken);
+ boolean success = false;
+ try {
+ buffer.reset();
+ success = true;
+ } catch (IOException e) {
+ // success==false if we hit an exception
+ }
+ if (success) {
+ if (buffer.hasAttribute(TermAttribute.class)) {
+ termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
}
- catch (IOException e) {
- nextToken = null;
+ if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+ posIncrAtt = (PositionIncrementAttribute) buffer
+ .getAttribute(PositionIncrementAttribute.class);
}
- if (nextToken == null)
- break;
- list.add(nextToken.clone());
- if (nextToken.getPositionIncrement() == 1)
- positionCount++;
- else
- severalTokensAtSamePosition = true;
}
+
+ boolean hasMoreTokens = false;
+ if (termAtt != null) {
+ try {
+ hasMoreTokens = buffer.incrementToken();
+ while (hasMoreTokens) {
+ numTokens++;
+ int positionIncrement = (posIncrAtt != null) ? posIncrAtt
+ .getPositionIncrement() : 1;
+
+ if (positionIncrement == 1)
+ positionCount++;
+ else
+ severalTokensAtSamePosition = true;
+
+ hasMoreTokens = buffer.incrementToken();
+
+ }
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+
try {
+ // rewind the buffer stream
+ buffer.reset();
+
+ // close original stream - all tokens buffered
source.close();
- }
- catch (IOException e) {
+ } catch (IOException e) {
// ignore
}
- if (list.size() == 0)
+ if (numTokens == 0)
return null;
- else if (list.size() == 1) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(0);
- return new TermQuery(new Term(field, nextToken.term()));
+ else if (numTokens == 1) {
+
+ try {
+ buffer.incrementToken();
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
+ return new TermQuery(new Term(field, termAtt.term()));
+
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery q = new BooleanQuery();
- for (int i = 0; i < list.size(); i++) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(i);
- TermQuery currentQuery = new TermQuery(
- new Term(field, nextToken.term()));
- q.add(currentQuery, BooleanClause.Occur.SHOULD);
+
+ try {
+ buffer.incrementToken();
+
+ do {
+
+ TermQuery currentQuery = new TermQuery(new Term(field, termAtt
+ .term()));
+ q.add(currentQuery, BooleanClause.Occur.SHOULD);
+
+ } while (buffer.incrementToken());
+
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
}
+
return q;
- }
- else {
+
+ } else {
// phrase query:
MultiPhraseQuery mpq = new MultiPhraseQuery();
- List multiTerms = new ArrayList();
- for (int i = 0; i < list.size(); i++) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(i);
- if (nextToken.getPositionIncrement() == 1 && multiTerms.size() > 0) {
- mpq.add((Term[])multiTerms.toArray(new Term[0]));
- multiTerms.clear();
- }
- multiTerms.add(new Term(field, nextToken.term()));
+ LinkedList- * Depending on settings, prefix term may be lower-cased - * automatically. It will not go through the default Analyzer, - * however, since normal Analyzers are unlikely to work properly - * with wildcard templates. + * Depending on settings, prefix term may be lower-cased automatically. It + * will not go through the default Analyzer, however, since normal Analyzers + * are unlikely to work properly with wildcard templates. *
* Can be overridden by extending classes, to provide custom handling for * wildcard queries, which may be necessary due to missing analyzer calls. - * + * * @param field Name of the field query will use. - * @param termStr Term token that contains one or more wild card - * characters (? or *), but is not simple prefix term - * + * @param termStr Term token that contains one or more wild card characters (? + * or *), but is not simple prefix term + * * @return Resulting {@link Query} built for the term * @exception ParseException throw in overridden method to disallow */ - protected Query getWildcardQuery(String field, String termStr) throws ParseException - { + protected Query getWildcardQuery(String field, String termStr) + throws ParseException { if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(); } @@ -551,30 +633,28 @@ } /** - * Factory method for generating a query (similar to - * {@link #getWildcardQuery}). Called when parser parses an input term - * token that uses prefix notation; that is, contains a single '*' wildcard - * character as its last character. Since this is a special case - * of generic wildcard term, and such a query can be optimized easily, - * this usually results in a different query object. + * Factory method for generating a query (similar to {@link #getWildcardQuery} + * ). Called when parser parses an input term token that uses prefix notation; + * that is, contains a single '*' wildcard character as its last character. + * Since this is a special case of generic wildcard term, and such a query can + * be optimized easily, this usually results in a different query object. *
- * Depending on settings, a prefix term may be lower-cased - * automatically. It will not go through the default Analyzer, - * however, since normal Analyzers are unlikely to work properly - * with wildcard templates. + * Depending on settings, a prefix term may be lower-cased automatically. It + * will not go through the default Analyzer, however, since normal Analyzers + * are unlikely to work properly with wildcard templates. *
- * Can be overridden by extending classes, to provide custom handling for
- * wild card queries, which may be necessary due to missing analyzer calls.
- *
+ * Can be overridden by extending classes, to provide custom handling for wild
+ * card queries, which may be necessary due to missing analyzer calls.
+ *
* @param field Name of the field query will use.
* @param termStr Term token to use for building term for the query
- * (without trailing '*' character!)
- *
+ * (without trailing '*' character!)
+ *
* @return Resulting {@link Query} built for the term
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getPrefixQuery(String field, String termStr) throws ParseException
- {
+ protected Query getPrefixQuery(String field, String termStr)
+ throws ParseException {
if (lowercaseExpandedTerms) {
termStr = termStr.toLowerCase();
}
@@ -582,19 +662,19 @@
return new PrefixQuery(t);
}
- /**
- * Factory method for generating a query (similar to
- * {@link #getWildcardQuery}). Called when parser parses
- * an input term token that has the fuzzy suffix (~) appended.
- *
+ /**
+ * Factory method for generating a query (similar to
+ * {@link #getWildcardQuery}). Called when parser parses an input term token
+ * that has the fuzzy suffix (~) appended.
+ *
* @param field Name of the field query will use.
* @param termStr Term token to use for building term for the query
- *
+ *
* @return Resulting {@link Query} built for the term
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException
- {
+ protected Query getFuzzyQuery(String field, String termStr,
+ float minSimilarity) throws ParseException {
if (lowercaseExpandedTerms) {
termStr = termStr.toLowerCase();
}
@@ -603,33 +683,33 @@
}
/**
- * Returns a String where the escape char has been
- * removed, or kept only once if there was a double escape.
+ * Returns a String where the escape char has been removed, or kept only once
+ * if there was a double escape.
*/
private String discardEscapeChar(String input) {
char[] caSource = input.toCharArray();
char[] caDest = new char[caSource.length];
int j = 0;
for (int i = 0; i < caSource.length; i++) {
- if ((caSource[i] != '\\') || (i > 0 && caSource[i-1] == '\\')) {
- caDest[j++]=caSource[i];
+ if ((caSource[i] != '\\') || (i > 0 && caSource[i - 1] == '\\')) {
+ caDest[j++] = caSource[i];
}
}
return new String(caDest, 0, j);
}
/**
- * Returns a String where those characters that QueryParser
- * expects to be escaped are escaped by a preceding \.
+ * Returns a String where those characters that QueryParser expects to be
+ * escaped are escaped by a preceding \.
*/
public static String escape(String s) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
// NOTE: keep this in sync with _ESCAPED_CHAR below!
- if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':'
- || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~'
- || c == '*' || c == '?') {
+ if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')'
+ || c == ':' || c == '^' || c == '[' || c == ']' || c == '\"'
+ || c == '{' || c == '}' || c == '~' || c == '*' || c == '?') {
sb.append('\\');
}
sb.append(c);
@@ -638,492 +718,540 @@
}
/**
- * Command line tool to test QueryParser, using {@link org.apache.lucene.analysis.SimpleAnalyzer}.
- * Usage:
+ * Command line tool to test QueryParser, using
+ * {@link org.apache.lucene.analysis.SimpleAnalyzer}. Usage:
* java org.apache.lucene.queryParser.QueryParser <input>
*/
public static void main(String[] args) throws Exception {
if (args.length == 0) {
- System.out.println("Usage: java org.apache.lucene.queryParser.QueryParser <input>");
+ System.out
+ .println("Usage: java org.apache.lucene.queryParser.QueryParser <input>");
System.exit(0);
}
PrecedenceQueryParser qp = new PrecedenceQueryParser("field",
- new org.apache.lucene.analysis.SimpleAnalyzer());
+ new org.apache.lucene.analysis.SimpleAnalyzer());
Query q = qp.parse(args[0]);
System.out.println(q.toString("field"));
}
-// * Query ::= ( Clause )*
-// * Clause ::= ["+", "-"] [