Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestMultiAnalyzer.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestMultiAnalyzer.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestMultiAnalyzer.java (revision 0)
@@ -0,0 +1,215 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.spans.SpanQueryParser;
+
+/**
+ * Test SpanQueryParser's ability to deal with Analyzers that return more
+ * than one token per position or that return tokens with a position
+ * increment > 1.
+ *
+ * Copied nearly verbatim from the classic QueryParser's TestMultiAnalyzer.
+ *
+ */
+public class TestMultiAnalyzer extends BaseTokenStreamTestCase{
+ private static int multiToken = 0;
+
+ public void testMultiAnalyzer() throws ParseException {
+
+ SpanQueryParser qp = new SpanQueryParser(TEST_VERSION_CURRENT, "", new MultiAnalyzer());
+
+ // trivial, no multiple tokens:
+ assertEquals("foo", qp.parse("foo").toString());
+ assertEquals("foo", qp.parse("\"foo\"").toString());
+ assertEquals("foo foobar", qp.parse("foo foobar").toString());
+ assertEquals("spanNear([foo, foobar], 0, true)", qp.parse("\"foo foobar\"").toString());
+ assertEquals("spanNear([foo, foobar, blah], 0, true)", qp.parse("\"foo foobar blah\"").toString());
+
+ // two tokens at the same position:
+ assertEquals("spanOr([multi, multi2]) foo", qp.parse("multi foo").toString());
+ assertEquals("foo spanOr([multi, multi2])", qp.parse("foo multi").toString());
+ assertEquals("spanOr([multi, multi2]) spanOr([multi, multi2])", qp.parse("multi multi").toString());
+ assertEquals("+(foo spanOr([multi, multi2])) +(bar spanOr([multi, multi2]))",
+ qp.parse("+(foo multi) +(bar multi)").toString());
+ assertEquals("+(foo spanOr([multi, multi2])) spanNear([field:bar, spanOr([field:multi, field:multi2])], 0, true)",
+ qp.parse("+(foo multi) field:\"bar multi\"").toString());
+
+ // phrases:
+ assertEquals("spanNear([spanOr([multi, multi2]), foo], 0, true)", qp.parse("\"multi foo\"").toString());
+ assertEquals("spanNear([foo, spanOr([multi, multi2])], 0, true)", qp.parse("\"foo multi\"").toString());
+ assertEquals("spanNear([foo, spanOr([multi, multi2]), foobar, spanOr([multi, multi2])], 0, true)",
+ qp.parse("\"foo multi foobar multi\"").toString());
+
+ // fields:
+ assertEquals("spanOr([field:multi, field:multi2]) field:foo", qp.parse("field:multi field:foo").toString());
+ assertEquals("spanNear([spanOr([field:multi, field:multi2]), field:foo], 0, true)", qp.parse("field:\"multi foo\"").toString());
+
+ // three tokens at one position:
+ assertEquals("spanOr([triplemulti, multi3, multi2])", qp.parse("triplemulti").toString());
+ assertEquals("foo spanOr([triplemulti, multi3, multi2]) foobar",
+ qp.parse("foo triplemulti foobar").toString());
+
+ // phrase with non-default slop:
+ assertEquals("spanNear([spanOr([multi, multi2]), foo], 10, false)", qp.parse("\"multi foo\"~10").toString());
+
+ // phrase with non-default boost:
+ assertEquals("spanNear([spanOr([multi, multi2]), foo], 0, true)^2.0", qp.parse("\"multi foo\"^2").toString());
+
+ // phrase after changing default slop
+ qp.setPhraseSlop(99);
+ assertEquals("spanNear([spanOr([multi, multi2]), foo], 99, false) bar",
+ qp.parse("\"multi foo\" bar").toString());
+ assertEquals("spanNear([spanOr([multi, multi2]), foo], 99, false) spanNear([foo, bar], 2, false)",
+ qp.parse("\"multi foo\" \"foo bar\"~2").toString());
+ qp.setPhraseSlop(0);
+
+ }
+
+
+ public void testPosIncrementAnalyzer() throws ParseException {
+ SpanQueryParser qp = new SpanQueryParser(TEST_VERSION_CURRENT,"", new PosIncrementAnalyzer());
+ assertEquals("quick brown", qp.parse("the quick brown").toString());
+ assertEquals("quick brown fox", qp.parse("the quick brown fox").toString());
+ }
+
+ /**
+ * Expands "multi" to "multi" and "multi2", both at the same position,
+ * and expands "triplemulti" to "triplemulti", "multi3", and "multi2".
+ */
+ private class MultiAnalyzer extends Analyzer {
+
+ @Override
+ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
+ return new TokenStreamComponents(result, new TestFilter(result));
+ }
+ }
+
+ private final class TestFilter extends TokenFilter {
+
+ private String prevType;
+ private int prevStartOffset;
+ private int prevEndOffset;
+
+ private final CharTermAttribute termAtt;
+ private final PositionIncrementAttribute posIncrAtt;
+ private final OffsetAttribute offsetAtt;
+ private final TypeAttribute typeAtt;
+
+ public TestFilter(TokenStream in) {
+ super(in);
+ termAtt = addAttribute(CharTermAttribute.class);
+ posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = addAttribute(OffsetAttribute.class);
+ typeAtt = addAttribute(TypeAttribute.class);
+ }
+
+ @Override
+ public final boolean incrementToken() throws java.io.IOException {
+ if (multiToken > 0) {
+ termAtt.setEmpty().append("multi"+(multiToken+1));
+ offsetAtt.setOffset(prevStartOffset, prevEndOffset);
+ typeAtt.setType(prevType);
+ posIncrAtt.setPositionIncrement(0);
+ multiToken--;
+ return true;
+ } else {
+ boolean next = input.incrementToken();
+ if (!next) {
+ return false;
+ }
+ prevType = typeAtt.type();
+ prevStartOffset = offsetAtt.startOffset();
+ prevEndOffset = offsetAtt.endOffset();
+ String text = termAtt.toString();
+ if (text.equals("triplemulti")) {
+ multiToken = 2;
+ return true;
+ } else if (text.equals("multi")) {
+ multiToken = 1;
+ return true;
+ } else {
+ return true;
+ }
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ this.prevType = null;
+ this.prevStartOffset = 0;
+ this.prevEndOffset = 0;
+ }
+ }
+
+ /**
+ * Analyzes "the quick brown" as: quick(incr=2) brown(incr=1).
+ * Does not work correctly for input other than "the quick brown ...".
+ */
+ private class PosIncrementAnalyzer extends Analyzer {
+
+ @Override
+ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
+ return new TokenStreamComponents(result, new TestPosIncrementFilter(result));
+ }
+ }
+
+ private final class TestPosIncrementFilter extends TokenFilter {
+
+ CharTermAttribute termAtt;
+ PositionIncrementAttribute posIncrAtt;
+
+ public TestPosIncrementFilter(TokenStream in) {
+ super(in);
+ termAtt = addAttribute(CharTermAttribute.class);
+ posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ }
+
+ @Override
+ public final boolean incrementToken () throws java.io.IOException {
+ while(input.incrementToken()) {
+ if (termAtt.toString().equals("the")) {
+ // stopword, do nothing
+ } else if (termAtt.toString().equals("quick")) {
+ posIncrAtt.setPositionIncrement(2);
+ return true;
+ } else {
+ posIncrAtt.setPositionIncrement(1);
+ return true;
+ }
+ }
+ return false;
+ }
+ }
+
+}
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQueryParserLexer.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQueryParserLexer.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQueryParserLexer.java (revision 0)
@@ -0,0 +1,801 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import static org.junit.Assert.*;
+
+import java.util.List;
+
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.spans.SpanQueryLexer;
+import org.apache.lucene.queryparser.spans.SpanQueryParserBase;
+import org.apache.lucene.queryparser.spans.tokens.SQPBooleanOpToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPBoostableToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPClause.TYPE;
+import org.apache.lucene.queryparser.spans.tokens.SQPField;
+import org.apache.lucene.queryparser.spans.tokens.SQPNearClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPNotNearClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPOrClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPRangeTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPRegexTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPToken;
+
+import org.junit.Test;
+
+public class TestSpanQueryParserLexer {
+ SpanQueryLexer lexer = new SpanQueryLexer();
+
+ @Test
+ public void testFields() throws ParseException{
+ executeSingleTokenTest(
+ "the quick f1: brown fox",
+ 2,
+ new SQPField("f1")
+ );
+
+ //no space
+ executeSingleTokenTest(
+ "the quick f1:brown fox",
+ 2,
+ new SQPField("f1")
+ );
+
+ boolean ex = false;
+ try{
+ //non-escaped colon
+
+ executeSingleTokenTest(
+ "the quick f1:f2:brown fox",
+ 2,
+ new SQPField("f1")
+ );
+ } catch (ParseException e){
+ ex = true;
+ }
+ assertTrue(ex);
+ //escaped colon
+ executeSingleTokenTest(
+ "the quick f1\\:f2:brown fox",
+ 2,
+ new SQPField("f1:f2")
+ );
+
+ //escaped colon
+ executeSingleTokenTest(
+ "the quick f1\\:f2:brown fox",
+ 3,
+ new SQPTerm("brown")
+ );
+ executeSingleTokenTest(
+ "the quick f1\\ f2: brown fox",
+ 2,
+ new SQPField("f1 f2")
+ );
+
+ //fields should not be tokenized within a regex
+ executeSingleTokenTest(
+ "the quick /f1: brown/ fox",
+ 2,
+ new SQPRegexTerm("f1: brown")
+ );
+
+ //fields are tokenized within parens
+ executeSingleTokenTest(
+ "the quick (f1: brown fox)",
+ 3,
+ new SQPField("f1")
+ );
+
+ ex = false;
+ try{
+ executeSingleTokenTest(
+ "the quick \"f1: brown fox\"",
+ 3,
+ null
+ );
+ } catch (ParseException e){
+ ex = true;
+ }
+ assertTrue(ex);
+ ex = false;
+ try{
+ //fields are tokenized within brackets
+ executeSingleTokenTest(
+ "the quick [f1: brown fox]",
+ 3,
+ new SQPField("f1")
+ );
+ } catch (ParseException e){
+ ex = true;
+ }
+ assertTrue(ex);
+
+ }
+
+ @Test
+ public void testRegexes() throws ParseException{
+ executeSingleTokenTest(
+ "the quick [brown (/rabb.?t/ /f?x/)]",
+ 5,
+ new SQPRegexTerm("rabb.?t")
+ );
+
+ executeSingleTokenTest(
+ "the quick [brown (ab/rabb.?t/cd /f?x/)]",
+ 6,
+ new SQPRegexTerm("rabb.?t")
+ );
+
+ //test regex unescape
+ executeSingleTokenTest(
+ "the quick [brown (/ra\\wb\\db\\/t/ /f?x/)]",
+ 5,
+ new SQPRegexTerm("ra\\wb\\db/t")
+ );
+
+ //test operators within regex
+ executeSingleTokenTest(
+ "the quick [brown (/(?i)a(b)+[c-e]*(f|g){0,3}/ /f?x/)]",
+ 5,
+ new SQPRegexTerm("(?i)a(b)+[c-e]*(f|g){0,3}")
+ );
+
+ }
+
+ @Test
+ public void testOr() throws ParseException{
+ SQPOrClause truth = new SQPOrClause(2,5);
+ truth.setMinimumNumberShouldMatch(SQPOrClause.DEFAULT_MINIMUM_NUMBER_SHOULD_MATCH);
+
+ executeSingleTokenTest(
+ "the quick (brown fox) jumped",
+ 2,
+ truth
+ );
+
+ truth.setMinimumNumberShouldMatch(23);
+ executeSingleTokenTest(
+ "the quick (brown fox)~23 jumped",
+ 2,
+ truth
+ );
+
+ truth.setMinimumNumberShouldMatch(2);
+ executeSingleTokenTest(
+ "the quick (brown fox)~ jumped",
+ 2,
+ truth
+ );
+
+ boolean ex = false;
+ try{
+ executeSingleTokenTest(
+ "the [quick (brown fox)~23 jumped]",
+ 23,
+ truth
+ );
+ } catch (ParseException e){
+ ex = true;
+ }
+ assertTrue(ex);
+
+ ex = false;
+ try{
+ executeSingleTokenTest(
+ "the [quick (brown fox)~ jumped]",
+ 3,
+ truth
+ );
+ } catch (ParseException e){
+ ex = true;
+ }
+ assertTrue(ex);
+
+ try{
+ executeSingleTokenTest(
+ "the \"quick (brown fox)~23 jumped\"",
+ 23,
+ truth
+ );
+ } catch (ParseException e){
+ ex = true;
+ }
+ assertTrue(ex);
+
+ ex = false;
+ try{
+ executeSingleTokenTest(
+ "the \"quick (brown fox)~ jumped\"",
+ 3,
+ truth
+ );
+ } catch (ParseException e){
+ ex = true;
+ }
+ assertTrue(ex);
+ }
+
+ @Test
+ public void testNear() throws ParseException{
+
+ SQPNearClause truth = new SQPNearClause(3, 5, TYPE.QUOTE, false,
+ SQPNearClause.UNSPECIFIED_IN_ORDER,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ executeSingleTokenTest(
+ "the quick \"brown fox\" jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, true,
+ false,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ executeSingleTokenTest(
+ "the quick \"brown fox\"~ jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, true,
+ true,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ executeSingleTokenTest(
+ "the quick \"brown fox\"~> jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, true,
+ false,
+ 3);
+ executeSingleTokenTest(
+ "the quick \"brown fox\"~3 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, true,
+ true,
+ 3);
+ executeSingleTokenTest(
+ "the quick \"brown fox\"~>3 jumped",
+ 2,
+ truth
+ );
+
+ //now try with boosts
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, false,
+ SQPNearClause.UNSPECIFIED_IN_ORDER,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ truth.setBoost(new Float(2.5));
+
+ executeSingleTokenTest(
+ "the quick \"brown fox\"^2.5 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, true,
+ false,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ truth.setBoost(new Float(2.5));
+
+ executeSingleTokenTest(
+ "the quick \"brown fox\"~^2.5 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, true,
+ true,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ truth.setBoost(new Float(2.5));
+ executeSingleTokenTest(
+ "the quick \"brown fox\"~>^2.5 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, true,
+ false,
+ 3);
+ truth.setBoost(new Float(2.5));
+
+ executeSingleTokenTest(
+ "the quick \"brown fox\"~3^2.5 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.QUOTE, true,
+ true,
+ 3);
+ truth.setBoost(new Float(2.5));
+
+ executeSingleTokenTest(
+ "the quick \"brown fox\"~>3^2.5 jumped",
+ 2,
+ truth
+ );
+
+ //now test brackets
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, false,
+ SQPNearClause.UNSPECIFIED_IN_ORDER,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+
+
+ executeSingleTokenTest(
+ "the quick [brown fox] jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, true,
+ false,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ executeSingleTokenTest(
+ "the quick [brown fox]~ jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, true,
+ true,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+
+ executeSingleTokenTest(
+ "the quick [brown fox]~> jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, true,
+ false,
+ 3);
+
+ executeSingleTokenTest(
+ "the quick [brown fox]~3 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, true,
+ true,
+ 3);
+
+ executeSingleTokenTest(
+ "the quick [brown fox]~>3 jumped",
+ 2,
+ truth
+ );
+
+ //now brackets with boosts
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, false,
+ SQPNearClause.UNSPECIFIED_IN_ORDER,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ truth.setBoost(new Float(2.5));
+
+ executeSingleTokenTest(
+ "the quick [brown fox]^2.5 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, true,
+ false,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ truth.setBoost(new Float(2.5));
+
+ executeSingleTokenTest(
+ "the quick [brown fox]~^2.5 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, true,
+ true,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ truth.setBoost(new Float(2.5));
+ executeSingleTokenTest(
+ "the quick [brown fox]~>^2.5 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, true,
+ false,
+ 3);
+ truth.setBoost(new Float(2.5));
+
+ executeSingleTokenTest(
+ "the quick [brown fox]~3^2.5 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNearClause(3, 5, TYPE.BRACKET, true,
+ true,
+ 3);
+ truth.setBoost(new Float(2.5));
+
+ executeSingleTokenTest(
+ "the quick [brown fox]~>3^2.5 jumped",
+ 2,
+ truth
+ );
+ }
+
+ @Test
+ public void testBoosts() throws Exception {
+ String s = "apache^4";
+ List tokens = lexer.getTokens(s);
+ SQPToken t = tokens.get(0);
+ assertEquals(new Float(4), new Float(((SQPBoostableToken)t).getBoost()));
+ assertTrue(t instanceof SQPTerm);
+
+ s = "/apache/^4";
+ tokens = lexer.getTokens(s);
+ t = tokens.get(0);
+ assertEquals(new Float(4), new Float(((SQPBoostableToken)t).getBoost()));
+ assertTrue(t instanceof SQPRegexTerm);
+
+ s = "the [abc TO efg]^4 cat" ;
+ tokens = lexer.getTokens(s);
+ t = tokens.get(1);
+ assertEquals(new Float(4), new Float(((SQPBoostableToken)t).getBoost()));
+ assertTrue(t instanceof SQPRangeTerm);
+
+ s = "apache^.4";
+ tokens = lexer.getTokens(s);
+ t = tokens.get(0);
+ assertEquals(new Float(0.4), new Float(((SQPBoostableToken)t).getBoost()));
+ assertTrue(t instanceof SQPTerm);
+
+ s = "apache^0.4";
+ tokens = lexer.getTokens(s);
+ t = tokens.get(0);
+ assertEquals(new Float(0.4), new Float(((SQPBoostableToken)t).getBoost()));
+ assertTrue(t instanceof SQPTerm);
+
+ //negatives should not be parsed as boosts, boost for these should be UNSPECIFIED_BOOST
+ s = "apache^-4";
+ tokens = lexer.getTokens(s);
+ t = tokens.get(0);
+ assertEquals(new Float(SpanQueryParserBase.UNSPECIFIED_BOOST), new Float(((SQPBoostableToken)t).getBoost()));
+ assertTrue(t instanceof SQPTerm);
+
+ s = "apache^-.4";
+ tokens = lexer.getTokens(s);
+ t = tokens.get(0);
+ assertEquals(new Float(SpanQueryParserBase.UNSPECIFIED_BOOST), new Float(((SQPBoostableToken)t).getBoost()));
+ assertTrue(t instanceof SQPTerm);
+
+ s = "apache^-0.4";
+ tokens = lexer.getTokens(s);
+ t = tokens.get(0);
+ assertEquals(new Float(SpanQueryParserBase.UNSPECIFIED_BOOST), new Float(((SQPBoostableToken)t).getBoost()));
+ assertTrue(t instanceof SQPTerm);
+
+ }
+
+ @Test
+ public void testNotNear() throws ParseException{
+ SQPNotNearClause truth = new SQPNotNearClause(3, 5, TYPE.QUOTE,
+ SQPNotNearClause.NOT_DEFAULT, SQPNotNearClause.NOT_DEFAULT);
+
+ executeSingleTokenTest(
+ "the quick \"brown fox\"!~ jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNotNearClause(3, 5, TYPE.QUOTE,
+ 3, 3);
+ executeSingleTokenTest(
+ "the quick \"brown fox\"!~3 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNotNearClause(3, 5, TYPE.QUOTE,
+ 3, 4);
+ executeSingleTokenTest(
+ "the quick \"brown fox\"!~3,4 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNotNearClause(3, 5, TYPE.BRACKET,
+ SQPNotNearClause.NOT_DEFAULT,
+ SQPNotNearClause.NOT_DEFAULT);
+
+ executeSingleTokenTest(
+ "the quick [brown fox]!~ jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNotNearClause(3, 5, TYPE.BRACKET,
+ 3,
+ 3);
+ executeSingleTokenTest(
+ "the quick [brown fox]!~3 jumped",
+ 2,
+ truth
+ );
+
+ truth = new SQPNotNearClause(3, 5, TYPE.BRACKET,
+ 3,
+ 4);
+ executeSingleTokenTest(
+ "the quick [brown fox]!~3,4 jumped",
+ 2,
+ truth
+ );
+ }
+
+ @Test
+ public void testUnescapes() throws ParseException{
+    //the lexer should unescape field names
+    //and boolean operators, but nothing else;
+    //the parser may still need the escapes to determine
+    //the type of a multiterm query, among other things
+
+ executeSingleTokenTest(
+ "the qu\\(ck",
+ 1,
+ new SQPTerm("qu\\(ck")
+ );
+
+ executeSingleTokenTest(
+ "the qu\\[ck",
+ 1,
+ new SQPTerm("qu\\[ck")
+ );
+
+ executeSingleTokenTest(
+ "the qu\\+ck",
+ 1,
+ new SQPTerm("qu\\+ck")
+ );
+ executeSingleTokenTest(
+ "the qu\\-ck",
+ 1,
+ new SQPTerm("qu\\-ck")
+ );
+
+ executeSingleTokenTest(
+ "the qu\\\\ck",
+ 1,
+ new SQPTerm("qu\\\\ck")
+ );
+
+ executeSingleTokenTest(
+ "the qu\\ ck",
+ 1,
+ new SQPTerm("qu\\ ck")
+ );
+
+ executeSingleTokenTest(
+ "the field\\: quick",
+ 1,
+ new SQPTerm("field\\:")
+ );
+
+ executeSingleTokenTest(
+ "the quick \\AND nimble",
+ 2,
+ new SQPTerm("AND")
+ );
+
+ executeSingleTokenTest(
+ "the quick \\NOT nimble",
+ 2,
+ new SQPTerm("NOT")
+ );
+
+ executeSingleTokenTest(
+ "the quick \\OR nimble",
+ 2,
+ new SQPTerm("OR")
+ );
+
+ executeSingleTokenTest(
+ "the \\+ (quick -nimble)",
+ 1,
+ new SQPTerm("\\+")
+ );
+ }
+
+
+ @Test
+ public void testBoolean() throws Exception{
+
+ executeSingleTokenTest(
+ "the quick AND nimble",
+ 2,
+ new SQPBooleanOpToken(SpanQueryParserBase.CONJ_AND)
+ );
+
+ executeSingleTokenTest(
+ "the quick NOT nimble",
+ 2,
+ new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT)
+ );
+
+ executeSingleTokenTest(
+ "the (quick NOT nimble) fox",
+ 3,
+ new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT)
+ );
+
+
+    //TODO: confirm that this is the desired behavior: the lexer
+    //knows when it is inside a near clause and therefore does not
+    //parse boolean operators there
+ executeSingleTokenTest(
+ "the [quick NOT nimble] fox",
+ 3,
+ new SQPTerm("NOT")
+ );
+
+ executeSingleTokenTest(
+ "the +quick +nimble",
+ 1,
+ new SQPBooleanOpToken(SpanQueryParserBase.MOD_REQ)
+ );
+
+ executeSingleTokenTest(
+ "the +quick -nimble",
+ 3,
+ new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT)
+ );
+
+ executeSingleTokenTest(
+ "the +(quick -nimble)",
+ 1,
+ new SQPBooleanOpToken(SpanQueryParserBase.MOD_REQ)
+ );
+
+ executeSingleTokenTest(
+ "the +(quick -nimble)",
+ 4,
+ new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT)
+ );
+
+ }
+
+ @Test
+ public void testRange() throws ParseException{
+ executeSingleTokenTest(
+ "the [abc TO def] cat",
+ 1,
+ new SQPRangeTerm("abc", "def", true, true)
+ );
+
+ executeSingleTokenTest(
+ "the [quick brown ([abc TO def] fox)] cat",
+ 5,
+ new SQPRangeTerm("abc", "def", true, true)
+ );
+
+ SQPNearClause nearClause = new SQPNearClause(2, 5,
+ TYPE.BRACKET, false,
+ SQPNearClause.UNSPECIFIED_IN_ORDER,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+
+
+
+ executeSingleTokenTest(
+ "the [abc to def] cat",
+ 1,
+ nearClause
+ );
+
+ executeSingleTokenTest(
+ "the [abc \\TO def] cat",
+ 1,
+ nearClause
+ );
+
+ nearClause = new SQPNearClause(1, 4,
+ TYPE.BRACKET, false,
+ SQPNearClause.UNSPECIFIED_IN_ORDER,
+ SpanQueryParserBase.UNSPECIFIED_SLOP);
+ executeSingleTokenTest(
+ "[abc to def]",
+ 0,
+ nearClause
+ );
+
+    //the ~slop suffix makes these near clauses rather than ranges, even with TO
+ nearClause = new SQPNearClause(2, 5,
+ TYPE.BRACKET, true,
+ false,
+ 3);
+
+ executeSingleTokenTest(
+ "the [abc to def]~3 cat",
+ 1,
+ nearClause
+ );
+
+ executeSingleTokenTest(
+ "the [abc TO def]~3 cat",
+ 1,
+ nearClause
+ );
+
+ SQPNotNearClause notNear = new SQPNotNearClause(2,
+ 5, TYPE.BRACKET,
+ 1,
+ 2);
+
+ executeSingleTokenTest(
+ "the [abc TO def]!~1,2 cat",
+ 1,
+ notNear
+ );
+
+
+
+ //terms in range queries aren't checked for multiterm-hood
+ executeSingleTokenTest(
+ "the [abc~2 TO def] cat",
+ 1,
+ new SQPRangeTerm("abc~2", "def", true, true)
+ );
+
+ //terms in range queries aren't checked for multiterm-hood
+ executeSingleTokenTest(
+ "the [abc* TO *def] cat",
+ 1,
+ new SQPRangeTerm("abc*", "*def", true, true)
+ );
+
+    //an escaped \\TO is currently left escaped (not unescaped) by the lexer
+ executeSingleTokenTest(
+ "the [abc \\TO def] cat",
+ 3,
+ new SQPTerm("\\TO")
+ );
+
+
+ }
+ @Test
+ public void testBeyondBMP() throws Exception {
+ String bigChar = new String(new int[]{100000}, 0, 1);
+ String s = "ab"+bigChar+"cd";
+ executeSingleTokenTest(
+ s,
+ 0,
+ new SQPTerm(s)
+ );
+
+ }
+ private void executeSingleTokenTest(String q, int targetOffset, SQPToken truth)
+ throws ParseException{
+ List tokens = lexer.getTokens(q);
+ SQPToken target = tokens.get(targetOffset);
+ assertEquals(truth, target);
+ if (target instanceof SQPBoostableToken && truth instanceof SQPBoostableToken){
+ assertEquals(((SQPBoostableToken)truth).getBoost(),
+ ((SQPBoostableToken)target).getBoost(), 0.00001f);
+ }
+ }
+
+}
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java (revision 0)
@@ -0,0 +1,759 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.io.IOException;
+import java.io.Reader;
+
+import static org.apache.lucene.util.automaton.BasicAutomata.makeString;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.spans.SpanOnlyParser;
+import org.apache.lucene.queryparser.spans.AnalyzingQueryParserBase.NORM_MULTI_TERMS;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TotalHitCountCollector;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestSpanOnlyQueryParser extends LuceneTestCase {
+
+ private static IndexReader reader;
+ private static IndexSearcher searcher;
+ private static Directory directory;
+ private static Analyzer stopAnalyzer;
+ private static Analyzer noStopAnalyzer;
+ private static final String FIELD = "f1";
+ private static final Version VERSION = Version.LUCENE_50;
+
+ private static final CharacterRunAutomaton STOP_WORDS = new CharacterRunAutomaton(
+ BasicOperations.union(Arrays.asList(makeString("a"), makeString("an"),
+ makeString("and"), makeString("are"), makeString("as"),
+ makeString("at"), makeString("be"), makeString("but"),
+ makeString("by"), makeString("for"), makeString("if"),
+ makeString("in"), makeString("into"), makeString("is"),
+ makeString("it"), makeString("no"), makeString("not"),
+ makeString("of"), makeString("on"), makeString("or"),
+ makeString("such"), makeString("that"), makeString("the"),
+ makeString("their"), makeString("then"), makeString("there"),
+ makeString("these"), makeString("they"), makeString("this"),
+ makeString("to"), makeString("was"), makeString("will"),
+ makeString("with"), makeString("\u5927"))));
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+
+ // Whitespace tokenizer + mock CJK splitting, with NO stop-word removal.
+ noStopAnalyzer = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE,
+ true);
+ TokenFilter filter = new MockStandardTokenizerFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ // Same chain as above, but with STOP_WORDS removed at the end.
+ stopAnalyzer = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE,
+ true);
+ TokenFilter filter = new MockStandardTokenizerFilter(tokenizer);
+ filter = new MockTokenFilter(filter, STOP_WORDS);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ // NOTE: the index is built with stopAnalyzer, so stop words (including
+ // \u5927) are absent from the index regardless of which analyzer a
+ // parser under test uses.
+ directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, stopAnalyzer)
+ .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+ String[] docs = new String[] {
+ "the quick brown fox ",
+ "jumped over the lazy brown dog and the brown green cat",
+ "quick green fox",
+ "abcdefghijk",
+ "over green lazy",
+ // longish doc for recursion test
+ "eheu fugaces postume postume labuntur anni nec "
+ + "pietas moram rugis et instanti senectae "
+ + "adferet indomitaeque morti",
+ // non-whitespace language
+ "\u666E \u6797 \u65AF \u987F \u5927 \u5B66",
+ "reg/exp",
+ "/regex/",
+ "fuzzy~0.6",
+ "wil*card",
+ "wil?card",
+ "prefi*",
+
+ };
+
+ for (int i = 0; i < docs.length; i++) {
+ Document doc = new Document();
+ doc.add(newTextField(FIELD, docs[i], Field.Store.YES));
+ writer.addDocument(doc);
+ }
+ reader = writer.getReader();
+ searcher = new IndexSearcher(reader);
+ writer.close();
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+   reader.close();
+   directory.close();
+   // Null every static reference so LuceneTestCase's static-leak checks
+   // don't flag retained index objects. The original omitted searcher.
+   reader = null;
+   searcher = null;
+   directory = null;
+   stopAnalyzer = null;
+   noStopAnalyzer = null;
+ }
+
+ @Test
+ public void testBasic() throws Exception {
+
+   SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, stopAnalyzer);
+
+   // A null query string is a programming error and should NPE.
+   boolean ex = false;
+   try {
+     countSpansDocs(p, null, 0, 0);
+   } catch (NullPointerException e) {
+     ex = true;
+   }
+   assertTrue("null query should throw NullPointerException", ex);
+
+   // Empty string parses to a query with no matches.
+   countSpansDocs(p, "", 0, 0);
+
+   // "brown" occurs 3 times across 2 documents.
+   countSpansDocs(p, "brown", 3, 2);
+ }
+
+ @Test
+ public void testNear() throws Exception {
+   SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+
+   // Malformed phrase/near syntax must raise ParseException.
+   boolean exc = false;
+   try {
+     p.parse("\"brown \"dog\"");
+   } catch (ParseException e) {
+     exc = true;
+   }
+   assertTrue("unmatching \"", exc);
+
+   exc = false;
+   try {
+     p.parse("[brown [dog]");
+   } catch (ParseException e) {
+     exc = true;
+   }
+   assertTrue("unmatched [", exc);
+
+   // Exact phrase.
+   testOffsetForSingleSpanMatch(p, "\"brown dog\"", 1, 4, 6);
+
+   countSpansDocs(p, "\"lazy dog\"", 0, 0);
+
+   // Slop with and without required direction (~> means in-order).
+   testOffsetForSingleSpanMatch(p, "\"lazy dog\"~2", 1, 3, 6);
+
+   testOffsetForSingleSpanMatch(p, "\"lazy dog\"~>2", 1, 3, 6);
+
+   testOffsetForSingleSpanMatch(p, "\"dog lazy\"~2", 1, 3, 6);
+
+   countSpansDocs(p, "\"dog lazy\"~>2", 0, 0);
+
+   // Nested near clauses.
+   testOffsetForSingleSpanMatch(p, "[\"lazy dog\"~>2 cat]~10", 1, 3, 11);
+
+   testOffsetForSingleSpanMatch(p, "[\"lazy dog\"~>2 cat]~>10", 1, 3, 11);
+
+   countSpansDocs(p, "[cat \"lazy dog\"~>2]~>10", 0, 0);
+
+   // shows that "intervening" for multiple terms is additive
+   // 3 includes "over the" and "brown"
+   testOffsetForSingleSpanMatch(p, "[jumped lazy dog]~3", 1, 0, 6);
+
+   // only two words separate each hit, but together, the intervening words > 2
+   countSpansDocs(p, "[jumped lazy dog]~2", 0, 0);
+ }
+
+ @Test
+ public void testNotNear() throws Exception {
+   SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+
+   // A not-near clause requires exactly two components.
+   boolean exc = false;
+   try {
+     p.parse("\"brown dog car\"!~2,2");
+   } catch (ParseException e) {
+     exc = true;
+   }
+   assertTrue("must have 2 components", exc);
+
+   countSpansDocs(p, "\"brown dog\"!~2,2", 2, 2);
+
+   testOffsetForSingleSpanMatch(p, "\"brown (green dog)\"!~1,1", 0, 2, 3);
+
+   countSpansDocs(p, "\"brown (cat dog)\"!~1,1", 2, 2);
+
+   countSpansDocs(p, "\"brown (quick lazy)\"!~0,4", 3, 2);
+
+   countSpansDocs(p, "\"brown quick\"!~1,4", 2, 1);
+
+   testOffsetForSingleSpanMatch(p, "\"brown (quick lazy)\"!~1,4", 1, 8, 9);
+
+   // test empty
+   countSpansDocs(p, "\"z y\"!~0,4", 0, 0);
+
+   testOffsetForSingleSpanMatch(p, "[[quick fox]~3 brown]!~1,1", 2, 0, 3);
+
+   // traditional SpanNotQuery
+   testOffsetForSingleSpanMatch(p, "[[quick fox]~3 brown]!~", 2, 0, 3);
+ }
+
+ @Test
+ public void testWildcard() throws Exception {
+   SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+
+   // Default: leading wildcards are rejected with a ParseException.
+   boolean exc = false;
+   try {
+     p.parse("*og");
+   } catch (ParseException e) {
+     exc = true;
+   }
+   assertTrue("no leading wildcards", exc);
+   p.setAllowLeadingWildcard(true);
+
+   // lowercasing as default
+   testOffsetForSingleSpanMatch(p, "*OG", 1, 5, 6);
+
+   // With normalization off, an upper-case pattern no longer matches.
+   p.setNormMultiTerms(NORM_MULTI_TERMS.NONE);
+
+   countSpansDocs(p, "*OG", 0, 0);
+
+   testOffsetForSingleSpanMatch(p, "*og", 1, 5, 6);
+   testOffsetForSingleSpanMatch(p, "?og", 1, 5, 6);
+
+   // brown dog and brown fox
+   countSpansDocs(p, "[brown ?o?]", 2, 2);
+   countSpansDocs(p, "[br* ?o?]", 2, 2);
+ }
+
+ @Test
+ public void testPrefix() throws Exception {
+ SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+
+ // lowercasing as default
+ countSpansDocs(p, "BR*", 3, 2);
+
+ countSpansDocs(p, "br*", 3, 2);
+
+ // with normalization off, upper-case prefix no longer matches
+ p.setNormMultiTerms(NORM_MULTI_TERMS.NONE);
+ countSpansDocs(p, "BR*", 0, 0);
+
+ // not actually a prefix query
+ countSpansDocs(p, "br?", 0, 0);
+
+ // bare "*" is a leading wildcard matching every token in the index
+ p.setAllowLeadingWildcard(true);
+ countSpansDocs(p, "*", 45, 13);
+
+ }
+
+ @Test
+ public void testRegex() throws Exception {
+ SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+
+ // regexes are delimited by forward slashes; surrounding whitespace ignored
+ countSpansDocs(p, "/b[wor]+n/", 3, 2);
+ countSpansDocs(p, " /b[wor]+n/ ", 3, 2);
+
+ testOffsetForSingleSpanMatch(p, " [/b[wor]+n/ fox]", 0, 2, 4);
+
+ // no whitespace needed between a regex and the following term
+ testOffsetForSingleSpanMatch(p, " [/b[wor]+n/fox]", 0, 2, 4);
+
+ countSpansDocs(p, " [/b[wor]+n/ (fox dog)]", 2, 2);
+
+ //default is to set to lowercase
+ countSpansDocs(p, "/B[wor]+n/", 3, 2);
+
+ p.setNormMultiTerms(NORM_MULTI_TERMS.NONE);
+ countSpansDocs(p, "/B[wor]+n/", 0, 0);
+
+ //test special regex escape
+ countSpansDocs(p, "/reg\\/exp/", 1, 1);
+ }
+
+ @Test
+ public void testFuzzy() throws Exception {
+ //could use more testing of requested and fuzzyMinSim < 1.0f
+ SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+
+ // bare ~ uses the default max edit distance
+ countSpansDocs(p, "bruun~", 3, 2);
+ countSpansDocs(p, "bruun~2", 3, 2);
+
+ //default should reduce 3 to 2 and therefore not have any hits
+ countSpansDocs(p, "abcdefgh~3", 0, 0);
+
+ // raising fuzzyMinSim allows the requested edit distance of 3
+ p.setFuzzyMinSim(3.0f);
+ testOffsetForSingleSpanMatch(p, "abcdefgh~3", 3, 0, 1);
+
+ // default lowercasing
+ testOffsetForSingleSpanMatch(p, "Abcdefgh~3", 3, 0, 1);
+ p.setNormMultiTerms(NORM_MULTI_TERMS.NONE);
+ countSpansDocs(p, "Abcdefgh~3", 0, 0);
+
+ // ~> requires transpositions to count as separate edits (directional form)
+ countSpansDocs(p, "brwon~1", 3, 2);
+ countSpansDocs(p, "brwon~>1", 0, 0);
+
+ // two-argument form: edit distance, prefix length
+ countSpansDocs(p, "crown~1,1", 0, 0);
+ countSpansDocs(p, "crown~2,1", 0, 0);
+ countSpansDocs(p, "crown~3,1", 0, 0);
+ countSpansDocs(p, "brwn~1,1", 3, 2);
+
+ // fractional similarity thresholds interact with the parser's fuzzyMinSim
+ p.setFuzzyMinSim(0.6f);
+ countSpansDocs(p, "brwon~0.80", 3, 2);
+
+ p.setFuzzyMinSim(0.85f);
+ countSpansDocs(p, "brwon~0.80", 0, 0);
+
+ p.setFuzzyMinSim(0.80f);
+
+ countSpansDocs(p, "brwon~2", 3, 2);
+
+ p.setFuzzyMinSim(0.60f);
+ //this requires edit = 3
+ testOffsetForSingleSpanMatch(p, "abcdefgh~0.60", 3, 0, 1);
+
+ p.setFuzzyMinSim(0.65f);
+ //this requires edit = 3, 63%
+ countSpansDocs(p, "abcdefgh~0.60", 0, 0);
+
+ //fuzzy val of 0 should yield straight SpanTermQuery
+ Query q = p.parse("brown~0.0");
+ assertTrue("fuzzy val = 0.0", q instanceof SpanTermQuery);
+ q = p.parse("brown~0");
+ assertTrue("fuzzy val = 0", q instanceof SpanTermQuery);
+
+ }
+
+ @Test
+ public void testStopWords() throws Exception {
+ // Stop word handling has some room for improvement with SpanQuery
+ // These tests codify the expectations (for regular behavior,
+ // parse exceptions and false hits) as of this writing.
+
+ SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, stopAnalyzer);
+
+ // a lone stop word analyzes away to nothing
+ countSpansDocs(p, "the", 0, 0);
+
+ // these are whittled down to just a query for brown
+ countSpansDocs(p, "[the brown]", 3, 2);
+
+ countSpansDocs(p, "(the brown)", 3, 2);
+
+ // a not-near clause with an empty component is a parse error
+ testException(p, "[brown the]!~5,5");
+
+ // this will not match because "the" is silently dropped from the query
+ countSpansDocs(p, "[over the lazy]", 0, 0);
+
+ // this will get one right hit, but incorrectly match "over green lazy"
+ countSpansDocs(p, "[over the lazy]~1", 2, 2);
+
+ // test throw exception
+ p.setThrowExceptionForEmptyTerm(true);
+ p.setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE);
+
+ // every query here contains a term that analyzes to nothing, so with
+ // throwExceptionForEmptyTerm each must raise a ParseException
+ String[] stopExs = new String[]{
+ "the",
+ "[the brown]",
+ "the brown",
+ "(the brown)",
+ "\"the brown\"",
+ "\"the\"",
+ "[the brown]!~2,2",
+ "[brown the]!~2,2",
+ "the*ter",
+ "the?ter"
+ };
+ for (String ex : stopExs){
+ testException(p, ex);
+ }
+
+ // add tests for surprise phrasal with stopword!!! chinese
+
+ SpanOnlyParser noStopsParser = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+ noStopsParser.setAutoGeneratePhraseQueries(true);
+ // won't match because stop word was dropped in index
+ countSpansDocs(noStopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0);
+ // won't match for same reason
+ countSpansDocs(noStopsParser, "[\u666E\u6797\u65AF\u987F\u5927\u5B66]~2",
+ 0, 0);
+
+ testOffsetForSingleSpanMatch(noStopsParser,
+ "[\u666E \u6797 \u65AF \u987F \u5B66]~2", 6, 0, 6);
+
+ }
+
+
+
+ @Test
+ public void testNonWhiteSpaceLanguage() throws Exception {
+   SpanOnlyParser noStopsParser = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+
+   testOffsetForSingleSpanMatch(noStopsParser, "\u666E", 6, 0, 1);
+
+   countSpansDocs(noStopsParser, "\u666E\u6797", 2, 1);
+
+   countSpansDocs(noStopsParser, "\u666E\u65AF", 2, 1);
+
+   noStopsParser.setAutoGeneratePhraseQueries(true);
+
+   testOffsetForSingleSpanMatch(noStopsParser, "\u666E\u6797", 6, 0, 2);
+
+   // this would have a hit if autogenerate phrase queries = false
+   countSpansDocs(noStopsParser, "\u666E\u65AF", 0, 0);
+
+   // treat as "or", this should have two spans
+   countSpansDocs(noStopsParser, "\u666E \u65AF", 2, 1);
+
+   // stop word removed at indexing time and non existent here,
+   // this is treated as an exact phrase and should not match
+   countSpansDocs(noStopsParser, "\u666E\u6797\u65AF\u987F\u5B66", 0, 0);
+
+   // this should be the same as above
+   countSpansDocs(noStopsParser, "[\u666E \u6797 \u65AF \u987F \u5B66]~0", 0,
+       0);
+
+   // look for the same phrase but allow for some slop; this should have one
+   // hit because this will skip the stop word
+
+   testOffsetForSingleSpanMatch(noStopsParser,
+       "[\u666E \u6797 \u65AF \u987F \u5B66]~1", 6, 0, 6);
+
+   // This tests the #specialHandlingForSpanNearWithOneComponent
+   // this is initially treated as [ [\u666E\u6797\u65AF\u5B66]~>0 ]~2
+   // with the special treatment, this is rewritten as
+   // [\u666E \u6797 \u65AF \u5B66]~2
+   testOffsetForSingleSpanMatch(noStopsParser,
+       "[\u666E\u6797\u65AF\u5B66]~2", 6, 0, 6);
+
+   //If someone enters in a space delimited phrase within a phrase,
+   //treat it literally. There should be no matches.
+   countSpansDocs(noStopsParser, "[[lazy dog] ]~4", 0, 0);
+
+   noStopsParser.setAutoGeneratePhraseQueries(false);
+
+   // characters split into 2 tokens and treated as an "or" query
+   countSpansDocs(noStopsParser, "\u666E\u65AF", 2, 1);
+
+   // TODO: Not sure i like how this behaves.
+   // this is treated as [(\u666E \u6797 \u65AF \u987F \u5B66)]~1
+   // which is then simplified to just: (\u666E \u6797 \u65AF \u987F \u5B66)
+   // Probably better to be treated as [\u666E \u6797 \u65AF \u987F \u5B66]~1
+   testOffsetForSingleSpanMatch(noStopsParser,
+       "[\u666E\u6797\u65AF\u987F\u5B66]~1", 6, 0, 6);
+
+   SpanOnlyParser stopsParser = new SpanOnlyParser(VERSION, FIELD, stopAnalyzer);
+   stopsParser.setAutoGeneratePhraseQueries(true);
+   countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0);
+
+   // now test for throwing of exception; reuse the shared helper instead
+   // of an inline try/catch (same expectation: parse must throw).
+   stopsParser.setThrowExceptionForEmptyTerm(true);
+   testException(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66");
+ }
+
+ @Test
+ public void testQuotedSingleTerm() throws Exception {
+   // Quoting a single term should force literal treatment of what would
+   // otherwise be multiterm syntax (regex, fuzzy, wildcard, prefix).
+   SpanOnlyParser parser = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+   String[] literals = {"/regex/", "fuzzy~0.6", "wil*card", "wil?card", "prefi*"};
+   for (String literal : literals) {
+     countSpansDocs(parser, "\"" + literal + "\"", 1, 1);
+   }
+ }
+
+ @Test
+ public void testRangeQueries() throws Exception {
+ // Intentionally empty placeholder.
+ //TODO: add tests, now fairly well covered by TestSPanQPBasedonQPTestBase
+ }
+
+
+
+ @Test
+ public void testRecursion() throws Exception {
+   /*
+    * For easy reference of expected offsets
+    *
+    * 0: eheu 1: fugaces 2: postume 3: postume 4: labuntur 5: anni 6: nec 7:
+    * pietas 8: moram 9: rugis 10: et 11: instanti 12: senectae 13: adferet 14:
+    * indomitaeque 15: morti
+    */
+   SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer);
+
+   // Span extents end at one more than the actual end, e.g.:
+   String q = "fugaces";
+   testOffsetForSingleSpanMatch(p, q, 5, 1, 2);
+
+   q = "morti";
+   testOffsetForSingleSpanMatch(p, q, 5, 15, 16);
+
+   q = "[labunt* [pietas [rug?s senec*]~2 ]~4 adferet]~2";
+   testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+   // not near query for rugis senectae
+   q = "[labunt* [pietas [rug?s senec*]!~2 ]~4 adferet]~2";
+   countSpansDocs(p, q, 0, 0);
+
+   // not near query for rugis senectae, 0 before or 2 after
+   // Have to extend overall distance to 5 because hit for
+   // "rug?s senec*" matches only "rug?s" now
+   q = "[labunt* [pietas [rug?s senec*]!~2,0 ]~4 adferet]~5";
+   testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+   // not near query for rugis senectae, 0 before or 2 intervening
+   q = "[labunt* [pietas [rug?s senec*]!~0,2 ]~4 adferet]~5";
+   testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+   // not near query for rugis senectae, 0 before or 3 intervening
+   q = "[labunt* [pietas [rug?s senec*]!~0,3 ]~4 adferet]~2";
+   countSpansDocs(p, q, 0, 0);
+
+   // directionality specified
+   q = "[labunt* [pietas [rug?s senec*]~>2 ]~>4 adferet]~>2";
+   testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+   // no directionality, query order inverted
+   q = "[adferet [ [senec* rug?s ]~2 pietas ]~4 labunt*]~2";
+   testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+   // more than one word intervenes btwn rugis and senectae
+   q = "[labunt* [pietas [rug?s senec*]~1 ]~4 adferet]~2";
+   countSpansDocs(p, q, 0, 0);
+
+   // more than one word intervenes btwn labuntur and pietas
+   q = "[labunt* [pietas [rug?s senec*]~2 ]~4 adferet]~1";
+   countSpansDocs(p, q, 0, 0);
+ }
+
+ // Asserts that evaluating q against p raises a ParseException.
+ private void testException(SpanOnlyParser p, String q) throws Exception {
+   boolean thrown = false;
+   try {
+     countSpansDocs(p, q, 3, 2);
+   } catch (ParseException expected) {
+     thrown = true;
+   }
+   assertTrue(q, thrown);
+ }
+ // Parses s with p and verifies both the total number of span matches and
+ // the number of distinct documents hit.
+ private void countSpansDocs(SpanOnlyParser p, String s, int spanCount,
+ int docCount) throws Exception {
+ SpanQuery q = (SpanQuery)p.parse(s);
+ assertEquals("spanCount: " + s, spanCount, countSpans(q));
+ assertEquals("docCount: " + s, docCount, countDocs(q));
+ }
+
+ /**
+  * Counts every span match for q in the test index. The index is expected
+  * to be a single segment (asserted), so only one leaf is consulted.
+  */
+ private long countSpans(SpanQuery q) throws Exception {
+   List<AtomicReaderContext> ctxs = reader.leaves();
+   assert (ctxs.size() == 1);
+   AtomicReaderContext ctx = ctxs.get(0);
+   q = (SpanQuery) q.rewrite(ctx.reader());
+   // empty term-context map: getSpans builds what it needs for this case
+   Spans spans = q.getSpans(ctx, null, new HashMap<Term, TermContext>());
+
+   long i = 0;
+   while (spans.next()) {
+     i++;
+   }
+   return i;
+ }
+
+ /**
+  * Counts the distinct documents with at least one span match for q, and
+  * cross-checks the result against a regular collector-based search.
+  */
+ private long countDocs(SpanQuery q) throws Exception {
+   OpenBitSet docs = new OpenBitSet();
+   List<AtomicReaderContext> ctxs = reader.leaves();
+   assert (ctxs.size() == 1);
+   AtomicReaderContext ctx = ctxs.get(0);
+   IndexReaderContext parentCtx = reader.getContext();
+   q = (SpanQuery) q.rewrite(ctx.reader());
+
+   // Build term contexts explicitly, as getSpans requires for rewritten queries.
+   Set<Term> qTerms = new HashSet<Term>();
+   q.extractTerms(qTerms);
+   Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
+
+   for (Term t : qTerms) {
+     TermContext c = TermContext.build(parentCtx, t);
+     termContexts.put(t, c);
+   }
+
+   Spans spans = q.getSpans(ctx, null, termContexts);
+
+   while (spans.next()) {
+     docs.set(spans.doc());
+   }
+   long spanDocHits = docs.cardinality();
+   // double check with a regular searcher
+   TotalHitCountCollector coll = new TotalHitCountCollector();
+   searcher.search(q, coll);
+   assertEquals(coll.getTotalHits(), spanDocHits);
+   return spanDocHits;
+ }
+
+ /**
+  * Asserts that s produces exactly one span match, located in document
+  * trueDocID with positions [trueSpanStart, trueSpanEnd) — note span ends
+  * are exclusive (one past the last matched position).
+  */
+ private void testOffsetForSingleSpanMatch(SpanOnlyParser p, String s,
+     int trueDocID, int trueSpanStart, int trueSpanEnd) throws Exception {
+   SpanQuery q = (SpanQuery) p.parse(s);
+   List<AtomicReaderContext> ctxs = reader.leaves();
+   assert (ctxs.size() == 1);
+   AtomicReaderContext ctx = ctxs.get(0);
+   q = (SpanQuery) q.rewrite(ctx.reader());
+   Spans spans = q.getSpans(ctx, null, new HashMap<Term, TermContext>());
+
+   int i = 0;
+   int spanStart = -1;
+   int spanEnd = -1;
+   int docID = -1;
+   while (spans.next()) {
+     spanStart = spans.start();
+     spanEnd = spans.end();
+     docID = spans.doc();
+     i++;
+   }
+   assertEquals("should only be one matching span", 1, i);
+   assertEquals("doc id", trueDocID, docID);
+   assertEquals("span start", trueSpanStart, spanStart);
+   assertEquals("span end", trueSpanEnd, spanEnd);
+ }
+
+
+ /**
+  * Mocks StandardAnalyzer for tokenizing Chinese characters (at least for
+  * these test cases) into individual tokens, passing Basic Latin runs
+  * through as single tokens.
+  */
+ private final static class MockStandardTokenizerFilter extends TokenFilter {
+   // Only designed to handle test cases. You may need to modify this
+   // if adding new test cases. Note that position increment is hardcoded to be
+   // 1!!!
+   // Pattern is immutable and thread-safe; compile once instead of per instance.
+   private static final Pattern HACK_CJK_PATTERN = Pattern
+       .compile("([\u5900-\u9899])|([\\p{InBasic_Latin}]+)");
+   // FIFO of pending sub-tokens split out of the current input token.
+   private final List<String> buffer = new LinkedList<String>();
+
+   private final CharTermAttribute termAtt;
+   private final PositionIncrementAttribute posIncrAtt;
+
+   public MockStandardTokenizerFilter(TokenStream in) {
+     super(in);
+     termAtt = addAttribute(CharTermAttribute.class);
+     posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+   }
+
+   @Override
+   public final boolean incrementToken() throws java.io.IOException {
+     if (buffer.size() > 0) {
+       // Emit the next buffered sub-token.
+       termAtt.setEmpty().append(buffer.remove(0));
+       posIncrAtt.setPositionIncrement(1);
+       return true;
+     } else {
+       boolean next = input.incrementToken();
+       if (!next) {
+         return false;
+       }
+       String text = termAtt.toString();
+       Matcher m = HACK_CJK_PATTERN.matcher(text);
+       boolean hasCJK = false;
+       while (m.find()) {
+         if (m.group(1) != null) {
+           hasCJK = true;
+           buffer.add(m.group(1));
+         } else if (m.group(2) != null) {
+           buffer.add(m.group(2));
+         }
+       }
+       if (hasCJK == false) {
+         // don't change the position increment, the super class will handle
+         // stop words properly; pass the original token through untouched
+         buffer.clear();
+         return true;
+       }
+       if (buffer.size() > 0) {
+         termAtt.setEmpty().append(buffer.remove(0));
+         posIncrAtt.setPositionIncrement(1);
+       }
+       return true;
+     }
+   }
+ }
+}
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java (revision 0)
@@ -0,0 +1,304 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.spans.SpanQueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util._TestUtil;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestOverallSpanQueryParser extends LuceneTestCase{
+ private final static String FIELD1 = "f1";
+ private final static String FIELD2 = "f2";
+ private static Analyzer analyzer = null;
+ private static Directory directory = null;
+ private static IndexReader reader = null;
+ private static IndexSearcher searcher = null;
+ private static SpanQueryParser parser;
+ private final static Version VERSION = Version.LUCENE_50;
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ // Plain whitespace tokenization, no lowercasing, no stop words.
+ analyzer = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE,
+ false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+ directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)
+ .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+ // FIELD1 holds query-content under test; FIELD2 holds the doc's ordinal
+ // name so field-qualified queries can be distinguished.
+ String[] f1Docs = new String[] {
+ "quick brown AND fox",//0
+ "quick brown AND dog", //1
+ "quick brown dog", //2
+ "whan that aprile with its shoures perced", //3
+ "its shoures pierced", //4
+ "its shoures perced", //5
+ "#####", //before asterisk //6
+ "&&&&&", //after asterisk for range query //7
+ "ab*de", //8
+ "abcde" //9
+
+ };
+ String [] f2Docs = new String[] {
+ "zero",
+ "one",
+ "two",
+ "three",
+ "four",
+ "five",
+ "six",
+ "seven",
+ "eight",
+ "nine"
+ };
+ for (int i = 0; i < f1Docs.length; i++) {
+ Document doc = new Document();
+ doc.add(newTextField(FIELD1, f1Docs[i], Field.Store.YES));
+ doc.add(newTextField(FIELD2, f2Docs[i], Field.Store.YES));
+ writer.addDocument(doc);
+ }
+ reader = writer.getReader();
+ searcher = new IndexSearcher(reader);
+ writer.close();
+
+ parser = new SpanQueryParser(VERSION, FIELD1, analyzer);
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+   reader.close();
+   directory.close();
+   // Null every static reference so the test framework's static-leak
+   // check does not flag retained index objects. The original omitted
+   // the shared parser.
+   reader = null;
+   searcher = null;
+   directory = null;
+   analyzer = null;
+   parser = null;
+ }
+
+
+ @Test
+ public void testBooleanQueryConstruction() throws Exception {
+   // Explicit AND promotes its neighbors to MUST; others default to SHOULD.
+   String s = "cat dog AND elephant aardvark";
+   Query q = parser.parse(s);
+   assertTrue(q instanceof BooleanQuery);
+   BooleanQuery bq = (BooleanQuery) q;
+   List<BooleanClause> clauses = bq.clauses();
+   assertEquals(4, clauses.size());
+   testForClause(clauses, "cat", Occur.SHOULD);
+   testForClause(clauses, "dog", Occur.MUST);
+   testForClause(clauses, "elephant", Occur.MUST);
+   testForClause(clauses, "aardvark", Occur.SHOULD);
+
+   // NOT negates only its right-hand neighbor.
+   s = "cat dog NOT elephant aardvark";
+   q = parser.parse(s);
+   assertTrue(q instanceof BooleanQuery);
+   bq = (BooleanQuery) q;
+   clauses = bq.clauses();
+   assertEquals(4, clauses.size());
+   testForClause(clauses, "cat", Occur.SHOULD);
+   testForClause(clauses, "dog", Occur.SHOULD);
+   testForClause(clauses, "elephant", Occur.MUST_NOT);
+   testForClause(clauses, "aardvark", Occur.SHOULD);
+
+   // +/- prefixes map to MUST / MUST_NOT.
+   s = "cat +dog -elephant +aardvark";
+   q = parser.parse(s);
+   assertTrue(q instanceof BooleanQuery);
+   bq = (BooleanQuery) q;
+   clauses = bq.clauses();
+   assertEquals(4, clauses.size());
+   testForClause(clauses, "cat", Occur.SHOULD);
+   testForClause(clauses, "dog", Occur.MUST);
+   testForClause(clauses, "elephant", Occur.MUST_NOT);
+   testForClause(clauses, "aardvark", Occur.MUST);
+ }
+
+ @Test
+ public void testFields() throws Exception {
+ // Field qualifiers apply to the immediately following term or group.
+ compareHits("f1:brown f2:three", 0, 1, 2, 3);
+
+ //four should go back to f1
+ compareHits("f1:brown f2:three four", 0, 1, 2, 3);
+ compareHits("f1:brown f2:(three four)", 0, 1, 2, 3, 4);
+ compareHits("f1:brown f2:(three four) five", 0, 1, 2, 3, 4);
+ compareHits("f1:brown f2:(three four) f2:five", 0, 1, 2, 3, 4, 5);
+ compareHits("f1:brown f2:(f1:three four) f2:five", 0, 1, 2, 4, 5);
+
+ // same cases with FIELD2 as the parser's default field
+ SpanQueryParser p = new SpanQueryParser(VERSION, FIELD2, analyzer);
+ compareHits(p, "f1:brown three four", 0, 1, 2, 3, 4);
+ compareHits(p, "f1:brown (three four)", 0, 1, 2, 3, 4);
+ compareHits(p, "f1:brown (three four) five", 0, 1, 2, 3, 4, 5);
+ // NOTE(review): next line duplicates the previous one verbatim — possibly
+ // a different variant was intended; confirm.
+ compareHits(p, "f1:brown (three four) five", 0, 1, 2, 3, 4, 5);
+ compareHits(p, "f1:brown (f1:three four) five", 0, 1, 2, 4, 5);
+
+ }
+ @Test
+ public void testBooleanOrHits() throws Exception {
+ // bare group is an "or"; ~N on a group makes it a near (proximity) clause
+ compareHits("f2:three (brown dog)", 0, 1, 2, 3);
+ compareHits("f2:three (brown dog)~2", 1, 2, 3);
+ }
+
+ @Test
+ public void testBooleanHits() throws Exception {
+ //test treatment of AND within phrase
+ compareHits("quick NOT [brown AND (fox dog)]", 2);
+ compareHits("quick AND [bruwn~1 AND (f?x do?)]", 0, 1);
+ compareHits("(whan AND aprile) (shoures NOT perced)", 3, 4);
+ //test escaping of AND
+ compareHits("zoo \\AND elephant", 0, 1);
+ }
+
+
+
+ /**
+  * Asserts that clauses contains a SpanTermQuery on FIELD1 for term with
+  * the given occur. Parameter generified from a raw List; raw-typed
+  * callers still compile.
+  */
+ private void testForClause(List<BooleanClause> clauses, String term, Occur occur) {
+   assertTrue(clauses.contains(
+       new BooleanClause(
+           new SpanTermQuery(
+               new Term(FIELD1, term)),
+           occur))
+   );
+ }
+ // Convenience overload: uses a fresh parser with FIELD1 as default field.
+ private void compareHits(String s, int ... docids ) throws Exception{
+ compareHits(new SpanQueryParser(VERSION, FIELD1, analyzer), s, docids);
+ }
+
+ /**
+  * Parses s with p, runs the search, and asserts that the hit set is
+  * exactly the given doc ids (order-independent).
+  */
+ private void compareHits(SpanQueryParser p, String s, int... docids) throws Exception {
+   Query q = p.parse(s);
+   TopScoreDocCollector results = TopScoreDocCollector.create(1000, true);
+   searcher.search(q, results);
+   ScoreDoc[] scoreDocs = results.topDocs().scoreDocs;
+   Set<Integer> hits = new HashSet<Integer>();
+
+   for (int i = 0; i < scoreDocs.length; i++) {
+     hits.add(scoreDocs[i].doc);
+   }
+   assertEquals(docids.length, hits.size());
+
+   for (int i = 0; i < docids.length; i++) {
+     assertTrue(hits.contains(docids[i]));
+   }
+ }
+
+ @Test
+ public void testExceptions(){
+ // Each of these is malformed boolean syntax (doubled/dangling operators)
+ // and must produce a ParseException.
+ String[] strings = new String[]{
+ "cat OR OR dog",
+ "cat OR AND dog",
+ "cat AND AND dog",
+ "cat NOT NOT dog",
+ "cat NOT AND dog",
+ "cat NOT OR dog",
+ "cat NOT -dog",
+ "cat NOT +dog",
+ "OR",
+ "+",
+ "AND dog",
+ "OR dog",
+ "dog AND",
+ "dog OR",
+ "dog NOT",
+ "dog -",
+ "dog +"};
+
+ for (String s : strings){
+ testException(s, parser);
+ }
+ }
+
+ /**
+  * Asserts that parsing s throws a ParseException. Unlike the original,
+  * an unexpected exception type is reported with its cause attached
+  * instead of being silently swallowed before the assert fails.
+  */
+ private void testException(String s, SpanQueryParser p) {
+   try {
+     p.parse(s);
+   } catch (ParseException e) {
+     return; // expected
+   } catch (Exception e) {
+     throw new AssertionError("unexpected exception type for: " + s, e);
+   }
+   fail("expected ParseException for: " + s);
+ }
+
+ @Test
+ public void testIsEscaped() throws Exception{
+
+ // index 3 is the char after the backslashes; "a\\\\d" has an escaped
+ // backslash, so 'd' itself is NOT escaped
+ String[] notEscaped = new String[]{
+ "abcd",
+ "a\\\\d",
+ };
+ for (String s : notEscaped){
+ assertFalse(s, SpanQueryParserBase.isCharEscaped(s, 3));
+ }
+ String[] escaped = new String[]{
+ "ab\\d",
+ "\\\\\\d",
+ };
+ for (String s : escaped){
+ assertTrue(s, SpanQueryParserBase.isCharEscaped(s, 3));
+ }
+
+ // an escaped ~ or * is a literal char (SpanTermQuery); an unescaped one
+ // after an escaped backslash is still an operator (multiterm wrapper)
+ Query q = parser.parse("abc\\~2.0");
+ assertTrue(q.toString(), q instanceof SpanTermQuery);
+ q = parser.parse("abc\\\\\\~2.0");
+ assertTrue(q.toString(), q instanceof SpanTermQuery);
+ q = parser.parse("abc\\\\~2.0");
+ assertTrue(q.toString(), q instanceof SpanMultiTermQueryWrapper);
+
+ q = parser.parse("abc\\*d");
+ assertTrue(q.toString(), q instanceof SpanTermQuery);
+
+ q = parser.parse("abc\\\\\\*d");
+ assertTrue(q.toString(), q instanceof SpanTermQuery);
+
+ q = parser.parse("abc\\\\*d");
+ assertTrue(q.toString(), q instanceof SpanMultiTermQueryWrapper);
+
+ }
+}
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQPBasedOnQPTestBase.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQPBasedOnQPTestBase.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQPBasedOnQPTestBase.java (revision 0)
@@ -0,0 +1,1223 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.text.DateFormat;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.TimeZone;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
+import org.apache.lucene.queryparser.classic.QueryParser.Operator;
+import org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration;
+import org.apache.lucene.queryparser.spans.SpanQueryParser;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+public class TestSpanQPBasedOnQPTestBase extends LuceneTestCase {
+
+ public static Analyzer qpAnalyzer;
+ public static String FIELD = "f1";
+
+ /** Builds the shared test analyzer once for the whole suite. */
+ @BeforeClass
+ public static void beforeClass() {
+ qpAnalyzer = new QPTestAnalyzer();
+ }
+
+ /** Clears the static reference so it does not outlive the suite. */
+ @AfterClass
+ public static void afterClass() {
+ qpAnalyzer = null;
+ }
+
+ /**
+  * Test filter that discards the token 'stop' and expands the token
+  * 'phrase' into the two-token sequence 'phrase1 phrase2' (both emitted
+  * at the original token's offsets).
+  */
+ public static final class QPTestFilter extends TokenFilter {
+ CharTermAttribute termAtt;
+ OffsetAttribute offsetAtt;
+
+ /**
+ * Filter which discards the token 'stop' and which expands the
+ * token 'phrase' into 'phrase1 phrase2'
+ */
+ public QPTestFilter(TokenStream in) {
+ super(in);
+ termAtt = addAttribute(CharTermAttribute.class);
+ offsetAtt = addAttribute(OffsetAttribute.class);
+ }
+
+ // true while the buffered second half of an expanded 'phrase' is pending
+ boolean inPhrase = false;
+ int savedStart = 0, savedEnd = 0;
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (inPhrase) {
+ // emit 'phrase2', reusing the offsets saved from the source token
+ inPhrase = false;
+ clearAttributes();
+ termAtt.append("phrase2");
+ offsetAtt.setOffset(savedStart, savedEnd);
+ return true;
+ } else
+ while (input.incrementToken()) {
+ if (termAtt.toString().equals("phrase")) {
+ // emit 'phrase1' now; 'phrase2' follows on the next call
+ inPhrase = true;
+ savedStart = offsetAtt.startOffset();
+ savedEnd = offsetAtt.endOffset();
+ termAtt.setEmpty().append("phrase1");
+ offsetAtt.setOffset(savedStart, savedEnd);
+ return true;
+ } else if (!termAtt.toString().equals("stop"))
+ return true;
+ }
+ return false;
+ }
+ }
+
+ /** Analyzer pairing a lowercasing MockTokenizer with QPTestFilter. */
+ public static final class QPTestAnalyzer extends Analyzer {
+
+ /** Filters MockTokenizer with StopFilter. */
+ @Override
+ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+ return new TokenStreamComponents(tokenizer, new QPTestFilter(tokenizer));
+ }
+ }
+
+
+ // max-clause count in effect before this test ran
+ // NOTE(review): saved here, but no tearDown restoring it via
+ // BooleanQuery.setMaxClauseCount(originalMaxClauses) is visible in this
+ // hunk -- confirm it exists, otherwise a test raising the limit leaks state.
+ private int originalMaxClauses;
+
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ originalMaxClauses = BooleanQuery.getMaxClauseCount();
+ }
+
+ /** Builds a SpanQueryParser for {@code a} exposed via the common config interface. */
+ public CommonQueryParserConfiguration getParserConfig(Analyzer a) throws Exception{
+ CommonQueryParserConfiguration cqpc = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+ return cqpc;
+ }
+ /** Parses {@code query} with the default suite analyzer (qpAnalyzer). */
+ public Query getQuery(String query) throws Exception {
+ return getQuery(query, (Analyzer)null);
+ }
+
+ /** Parses {@code query} with {@code analyzer}, falling back to qpAnalyzer when null. */
+ private Query getQuery(String query, Analyzer analyzer) throws Exception {
+ Analyzer a = (analyzer == null) ? qpAnalyzer : analyzer;
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+ return p.parse(query);
+ }
+
+ /** Parses {@code query} with a pre-built config (must be a SpanQueryParser). */
+ public Query getQuery(String query, CommonQueryParserConfiguration cqpC) throws Exception{
+
+ SpanQueryParser p = (SpanQueryParser)cqpC;
+
+ return p.parse(query);
+ }
+ /** Sets a field-specific date resolution on the underlying SpanQueryParser. */
+ public void setDateResolution(CommonQueryParserConfiguration cqpC, CharSequence field, DateTools.Resolution value){
+ assert (cqpC instanceof SpanQueryParser);
+ ((SpanQueryParser)cqpC).setDateResolution(field.toString(), value);
+ }
+
+ /** Toggles auto-generation of phrase queries on the underlying SpanQueryParser. */
+ private void setAutoGeneratePhraseQueries(CommonQueryParserConfiguration qp,
+ boolean b) {
+ assert (qp instanceof SpanQueryParser);
+ ((SpanQueryParser)qp).setAutoGeneratePhraseQueries(b);
+
+ }
+
+ /** Parses {@code query} with {@code a} and asserts its toString(FIELD) equals {@code result}. */
+ public void assertQueryEquals(String query, Analyzer a, String result)
+ throws Exception {
+ Query q = getQuery(query, a);
+ String s = q.toString(FIELD);
+ if (!s.equals(result)) {
+ fail("Query /" + query + "/ yielded /" + s
+ + "/, expecting /" + result + "/");
+ }
+ }
+
+ /** Same check, but parsing via a pre-built parser config and rendering against {@code field}. */
+ public void assertQueryEquals(CommonQueryParserConfiguration cqpC, String field, String query, String result)
+ throws Exception {
+ Query q = getQuery(query, cqpC);
+ String s = q.toString(field);
+ if (!s.equals(result)) {
+ fail("Query /" + query + "/ yielded /" + s
+ + "/, expecting /" + result + "/");
+ }
+ }
+ /** Asserts the parsed query's boost equals {@code b} within a small tolerance. */
+ public void assertBoostEquals(String query, float b)
+ throws Exception {
+ double precision = 0.00001;
+ Query q = getQuery(query);
+ if (Math.abs(q.getBoost() - b) > precision){
+ fail("Query /" + query + "/ yielded boost:" + q.getBoost()
+ + "/, expecting /" + b + "/");
+ }
+ }
+
+ /**
+  * Asserts that {@code query} equals {@code q} wrapped in a
+  * SpanMultiTermQueryWrapper. The wrapper is parameterized with
+  * RegexpQuery to avoid the raw-type unchecked warnings of the
+  * original code.
+  */
+ private void assertEqualsWrappedRegexp(RegexpQuery q, Query query) {
+ assertTrue(query instanceof SpanMultiTermQueryWrapper);
+
+ SpanMultiTermQueryWrapper<RegexpQuery> wrapped = new SpanMultiTermQueryWrapper<RegexpQuery>(q);
+
+ assertEquals(wrapped, query);
+ }
+
+ /**
+  * Asserts that QueryParserBase.escape(query) equals {@code result}.
+  * The analyzer parameter is unused; kept for signature parity with the
+  * other assert helpers.
+  */
+ public void assertEscapedQueryEquals(String query, Analyzer a, String result)
+ throws Exception {
+ String escapedQuery = QueryParserBase.escape(query);
+ if (!escapedQuery.equals(result)) {
+ fail("Query /" + query + "/ yielded /" + escapedQuery
+ + "/, expecting /" + result + "/");
+ }
+ }
+
+
+ /** Asserts {@code query} renders as SpanMultiTermQueryWrapper(FIELD:expected). */
+ private void assertMultitermEquals(Query query,
+ String expected) throws Exception {
+ assertMultitermEquals(FIELD, query, expected);
+ }
+ /** Core check: compares query.toString() against the wrapper form, ignoring trailing ".0". */
+ private void assertMultitermEquals(String field, Query query,
+ String expected) throws Exception {
+ expected = "SpanMultiTermQueryWrapper("+field+":"+ expected+")";
+
+ //need to trim final .0 for fuzzy queries because
+ //sometimes they appear in the string and sometimes they don't
+ expected = expected.replace(".0)", ")");
+ String qString = query.toString().replace(".0)", ")");
+ assertEquals(expected, qString);
+
+ }
+
+ /** Parses {@code s} with the suite analyzer and checks the wrapper rendering. */
+ private void assertMultitermEquals(String s,
+ String expected) throws Exception {
+ assertMultitermEquals(s, qpAnalyzer, expected);
+ }
+
+ /** As above, additionally asserting the parsed query's boost. */
+ private void assertMultitermEquals(String s,
+ String expected, float boost) throws Exception {
+ Analyzer a = qpAnalyzer;
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+ Query q = p.parse(s);
+ assertMultitermEquals(q, expected);
+ assertEquals(q.getBoost(), boost, 0.000001f);
+ }
+
+ /** As above, with lowercaseExpandedTerms set to {@code b} before parsing. */
+ private void assertMultitermEquals(String query, boolean b,
+ String expected) throws Exception {
+ Analyzer a = qpAnalyzer;
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+ p.setLowercaseExpandedTerms(b);
+ Query q = p.parse(query);
+ assertMultitermEquals(q, expected);
+ }
+
+ /** Parses with an explicit analyzer and checks rendering against {@code field}. */
+ private void assertMultitermEquals(String field,
+ String query, Analyzer a, String expected) throws Exception{
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+ Query q = p.parse(query);
+ assertMultitermEquals(field, q, expected);
+ }
+
+ /** Explicit-analyzer variant rendered against the default FIELD. */
+ private void assertMultitermEquals(String query, Analyzer a, String expected) throws Exception{
+ assertMultitermEquals(FIELD, query, a, expected);
+ }
+
+ /** Variant controlling both lowercasing and leading-wildcard permission. */
+ private void assertMultitermEquals(String query, boolean lowercase,
+ String expected, boolean allowLeadingWildcard) throws Exception {
+ Analyzer a = qpAnalyzer;
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+ p.setLowercaseExpandedTerms(lowercase);
+ p.setAllowLeadingWildcard(allowLeadingWildcard);
+ Query q = p.parse(query);
+ assertMultitermEquals(q, expected);
+ }
+
+ /** Returns true if {@code pe} is the parser's checked ParseException. */
+ private boolean isQueryParserException(Exception pe) {
+ return pe instanceof ParseException;
+ }
+ /** Ideographic space (U+3000) must act as a term separator, like ASCII space. */
+ public void testCJK() throws Exception {
+ // Test Ideographic Space - As wide as a CJK character cell (fullwidth)
+ // used google to translate the word "term" to japanese -> 用語
+ assertQueryEquals("term\u3000term\u3000term", null, "term\u0020term\u0020term");
+ assertQueryEquals("用語\u3000用語\u3000用語", null, "用語\u0020用語\u0020用語");
+ }
+
+ //individual CJK chars as terms, like StandardAnalyzer
+ protected static class SimpleCJKTokenizer extends Tokenizer {
+ private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ public SimpleCJKTokenizer(Reader input) {
+ super(input);
+ }
+
+ /** Emits exactly one token per character read from the input. */
+ @Override
+ public final boolean incrementToken() throws IOException {
+ int ch = input.read();
+ if (ch < 0)
+ return false;
+ clearAttributes();
+ termAtt.setEmpty().append((char) ch);
+ return true;
+ }
+ }
+
+ /** Analyzer producing one token per character via SimpleCJKTokenizer. */
+ private class SimpleCJKAnalyzer extends Analyzer {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new SimpleCJKTokenizer(reader));
+ }
+ }
+
+ // NOTE(review): the CJK string literals below look mojibake-garbled
+ // (e.g. "ä¸" resembles double-encoded UTF-8) -- confirm the patch file is
+ // applied/stored as UTF-8 so the terms round-trip correctly.
+ /** Two adjacent CJK chars parse as an OR of two single-char terms. */
+ public void testCJKTerm() throws Exception {
+ // individual CJK chars as terms
+ SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer();
+
+ SpanOrQuery expected = new SpanOrQuery(
+ new SpanQuery[]{
+ new SpanTermQuery(new Term(FIELD, "ä¸")),
+ new SpanTermQuery(new Term(FIELD, "国"))
+ });
+
+ assertEquals(expected, getQuery("ä¸å›½", analyzer));
+ }
+
+ /** Boost on the unquoted pair applies to the whole SpanOrQuery. */
+ public void testCJKBoostedTerm() throws Exception {
+ // individual CJK chars as terms
+ SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer();
+
+ SpanOrQuery expected = new SpanOrQuery(
+ new SpanQuery[]{
+ new SpanTermQuery(new Term(FIELD, "ä¸")),
+ new SpanTermQuery(new Term(FIELD, "国"))
+ });
+ expected.setBoost(0.5f);
+
+ assertEquals(expected, getQuery("ä¸å›½^0.5", analyzer));
+ }
+
+ /** Quoting the pair yields an ordered SpanNearQuery with slop 0. */
+ public void testCJKPhrase() throws Exception {
+ // individual CJK chars as terms
+ SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer();
+
+ SpanNearQuery expected = new SpanNearQuery(
+ new SpanQuery[]{
+ new SpanTermQuery(new Term(FIELD, "ä¸")),
+ new SpanTermQuery(new Term(FIELD, "国"))
+ }, 0, true);
+
+ assertEquals(expected, getQuery("\"ä¸å›½\"", analyzer));
+ }
+
+ /** Boosted phrase keeps the SpanNearQuery structure and carries the boost. */
+ public void testCJKBoostedPhrase() throws Exception {
+ // individual CJK chars as terms
+ SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer();
+
+ SpanNearQuery expected = new SpanNearQuery(
+ new SpanQuery[]{
+ new SpanTermQuery(new Term(FIELD, "ä¸")),
+ new SpanTermQuery(new Term(FIELD, "国"))
+ }, 0, true);
+ expected.setBoost(0.5f);
+ assertEquals(expected, getQuery("\"ä¸å›½\"^0.5", analyzer));
+ }
+
+ /** Phrase slop ~3 produces an unordered SpanNearQuery with slop 3. */
+ public void testCJKSloppyPhrase() throws Exception {
+ // individual CJK chars as terms
+ SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer();
+
+ SpanNearQuery expected = new SpanNearQuery(
+ new SpanQuery[]{
+ new SpanTermQuery(new Term(FIELD, "ä¸")),
+ new SpanTermQuery(new Term(FIELD, "国"))
+ }, 3, false);
+
+ assertEquals(expected, getQuery("\"ä¸å›½\"~3", analyzer));
+ }
+
+
+ /** With autoGeneratePhraseQueries on, an unquoted multi-token term becomes an ordered near query. */
+ public void testAutoGeneratePhraseQueriesOn() throws Exception {
+ // individual CJK chars as terms
+ SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer();
+
+ SpanNearQuery expected = new SpanNearQuery(
+ new SpanTermQuery[]{
+ new SpanTermQuery(new Term(FIELD, "ä¸")),
+ new SpanTermQuery(new Term(FIELD, "国"))
+ }, 0, true);
+ CommonQueryParserConfiguration qp = getParserConfig(analyzer);
+ setAutoGeneratePhraseQueries(qp, true);
+ assertEquals(expected, getQuery("ä¸å›½",qp));
+ }
+
+
+
+ /**
+  * Smoke test of basic parsing: bare terms, AND/OR/NOT and +/- prefixes,
+  * grouping, field overrides, boosts, and phrases (which render as
+  * spanNear instead of classic PhraseQuery).
+  */
+ public void testSimple() throws Exception {
+ assertQueryEquals("term term term", null, "term term term");
+ assertQueryEquals("türm term term", new MockAnalyzer(random()), "türm term term");
+ assertQueryEquals("ümlaut", new MockAnalyzer(random()), "ümlaut");
+
+ // FIXME: enhance MockAnalyzer to be able to support this
+ // it must no longer extend CharTokenizer
+ //assertQueryEquals("\"\"", new KeywordAnalyzer(), "");
+ //assertQueryEquals("foo:\"\"", new KeywordAnalyzer(), "foo:");
+
+ assertQueryEquals("a AND b", null, "+a +b");
+ assertQueryEquals("(a AND b)", null, "+a +b");
+ assertQueryEquals("c (a AND b)", null, "c (+a +b)");
+ assertQueryEquals("a AND NOT b", null, "+a -b");
+ assertQueryEquals("a AND -b", null, "+a -b");
+
+ assertQueryEquals("a b", null, "a b");
+ assertQueryEquals("a -b", null, "a -b");
+
+ assertQueryEquals("+term -term term", null, "+term -term term");
+ assertQueryEquals("foo:term AND "+FIELD+":anotherTerm", null,
+ "+foo:term +anotherterm");
+ assertQueryEquals("term AND \"phrase phrase\"", null,
+ "+term +spanNear([spanOr([phrase1, phrase2]), "+
+ "spanOr([phrase1, phrase2])], 0, true)");
+ assertQueryEquals("\"hello there\"", null, "spanNear([hello, there], 0, true)");
+ assertTrue(getQuery("a AND b") instanceof BooleanQuery);
+ assertTrue(getQuery("hello") instanceof SpanTermQuery);
+ assertTrue(getQuery("\"hello there\"") instanceof SpanNearQuery);
+
+ assertQueryEquals("germ term^2.0", null, "germ term^2.0");
+ assertQueryEquals("(term)^2.0", null, "term^2.0");
+ assertQueryEquals("(germ term)^2.0", null, "(germ term)^2.0");
+ assertQueryEquals("term^2.0", null, "term^2.0");
+ assertQueryEquals("term^2", null, "term^2.0");
+ assertQueryEquals("\"germ term\"^2.0", null, "spanNear([germ, term], 0, true)^2.0");
+ assertQueryEquals("\"term germ\"^2", null, "spanNear([term, germ], 0, true)^2.0");
+
+ assertQueryEquals("(foo bar) AND (baz boo)", null,
+ "+(foo bar) +(baz boo)");
+ assertQueryEquals("((a b) AND NOT c) d", null,
+ "(+(a b) -c) d");
+ assertQueryEquals("+(apple \"steve jobs\") -(foo bar baz)", null,
+ "+(apple spanNear([steve, jobs], 0, true)) -(foo bar baz)");
+ assertQueryEquals("+title:(dog cat) -author:\"bob dole\"", null,
+ "+(title:dog title:cat) -spanNear([author:bob, author:dole], 0, true)");
+
+ }
+
+
+ /** Whitespace-separated +, -, ! are plain terms, not operators (LUCENE-2566). */
+ public void testOperatorVsWhitespace() throws Exception { //LUCENE-2566
+ // +,-,! should be directly adjacent to operand (i.e. not separated by whitespace) to be treated as an operator
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+ }
+ };
+ assertQueryEquals("a - b", a, "a - b");
+ assertQueryEquals("a + b", a, "a + b");
+ assertQueryEquals("a ! b", a, "a ! b");
+ }
+
+ /** Embedded punctuation survives parsing with a whitespace tokenizer. */
+ public void testPunct() throws Exception {
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+ assertQueryEquals("a&b", a, "a&b");
+ assertQueryEquals("a&&b", a, "a&&b");
+ assertQueryEquals(".NET", a, ".NET");
+ }
+
+ /** Phrase slop (~n) maps to unordered spanNear; single-term "phrases" collapse to the term. */
+ public void testSlop() throws Exception {
+ assertQueryEquals("\"term germ\"~2", null, "spanNear([term, germ], 2, false)");
+ assertQueryEquals("\"term germ\"~2 flork", null, "spanNear([term, germ], 2, false) flork");
+ assertQueryEquals("\"term\"~2", null, "term");
+ assertQueryEquals("\" \"~2 germ", null, "germ");
+ assertQueryEquals("\"term germ\"~2^2", null, "spanNear([term, germ], 2, false)^2.0");
+ }
+
+ /** Digits vanish under the default (letters-only) analysis but survive a whitespace analyzer. */
+ public void testNumber() throws Exception {
+ // The numbers go away because SimpleAnalzyer ignores them
+ assertQueryEquals("3", null, "spanOr([])");
+ assertQueryEquals("term 1.0 1 2", null, "term");
+ assertQueryEquals("term term1 term2", null, "term term term");
+
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
+ assertQueryEquals("3", a, "3");
+ assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
+ assertQueryEquals("term term1 term2", a, "term term1 term2");
+ }
+
+
+ /**
+  * Wildcard, prefix, fuzzy, and range queries: all should surface as
+  * SpanMultiTermQueryWrapper, with boosts and lowercasing honored, and
+  * leading wildcards rejected unless explicitly allowed.
+  */
+ public void testWildcard() throws Exception {
+ assertMultitermEquals("term*", "term*");
+
+ assertMultitermEquals("term*^2","term*", 2.0f);
+ assertMultitermEquals("term~", "term~2.0");
+ assertMultitermEquals("term~1", "term~1.0");
+ assertMultitermEquals("term~0.7","term~1.0");
+ assertMultitermEquals("term~^3", "term~2.0", 3.0f);
+ //not currently supported in SpanQueryParser
+ // assertWildcardQueryEquals("term^3~", "term~2.0", 3.0f);
+ assertMultitermEquals("term*germ", "term*germ");
+ assertMultitermEquals("term*germ^3", "term*germ", 3.0f);
+
+
+ PrefixQuery p = new PrefixQuery(new Term(FIELD, "term"));
+ SpanQuery wrapped = new SpanMultiTermQueryWrapper(p);
+ assertEquals(getQuery("term*"), wrapped);
+
+ p = new PrefixQuery(new Term(FIELD, "term"));
+ wrapped = new SpanMultiTermQueryWrapper(p);
+ Query parsed = getQuery("term*^2");
+ assertEquals(parsed, wrapped);
+ assertEquals(2.0f, parsed.getBoost(), 0.00001f);
+
+ FuzzyQuery f = new FuzzyQuery(new Term(FIELD, "term"), (int)2.0f);
+ wrapped = new SpanMultiTermQueryWrapper(f);
+
+ //not great test; better if we could retrieve wrapped query for testing.
+ //don't want to move these tests to SMTQW package.
+ assertTrue(getQuery("term~") instanceof SpanMultiTermQueryWrapper);
+ assertTrue(getQuery("term~0.7") instanceof SpanMultiTermQueryWrapper);
+ /*can't easily test this;
+ //FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7");
+ //assertEquals(1, fq.getMaxEdits());
+
+
+ assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
+ fq = (FuzzyQuery)getQuery("term~");
+ assertEquals(2, fq.getMaxEdits());
+ assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
+ */
+ //not true of SpanQueryParser...rounds value > 1
+ //assertParseException("term~1.1"); // value > 1, throws exception
+
+ assertTrue(getQuery("term*germ") instanceof SpanMultiTermQueryWrapper);
+
+ /* Tests to see that wild card terms are (or are not) properly
+ * lower-cased with propery parser configuration
+ */
+ // First prefix queries:
+ // by default, convert to lowercase:
+
+ assertMultitermEquals("Term*", true, "term*");
+ // explicitly set lowercase:
+ assertMultitermEquals("term*", true, "term*");
+ assertMultitermEquals("Term*", true, "term*");
+ assertMultitermEquals("TERM*", true, "term*");
+ // explicitly disable lowercase conversion:
+ assertMultitermEquals("term*", false, "term*");
+ assertMultitermEquals("Term*", false, "Term*");
+ assertMultitermEquals("TERM*", false, "TERM*");
+ // Then 'full' wildcard queries:
+ // by default, convert to lowercase:
+ assertMultitermEquals("Te?m", "te?m");
+ // explicitly set lowercase:
+ assertMultitermEquals("te?m", true, "te?m");
+ assertMultitermEquals("Te?m", true, "te?m");
+ assertMultitermEquals("TE?M", true, "te?m");
+ assertMultitermEquals("Te?m*gerM", true, "te?m*germ");
+ // explicitly disable lowercase conversion:
+ assertMultitermEquals("te?m", false, "te?m");
+ assertMultitermEquals("Te?m", false, "Te?m");
+ assertMultitermEquals("TE?M", false, "TE?M");
+ assertMultitermEquals("Te?m*gerM", false, "Te?m*gerM");
+ // Fuzzy queries:
+ assertMultitermEquals("Term~", "term~2.0");
+ assertMultitermEquals("Term~", true, "term~2.0");
+ assertMultitermEquals("Term~", false, "Term~2.0");
+ // Range queries:
+ assertMultitermEquals("[A TO C]", "[a TO c]");
+ assertMultitermEquals("[A TO C]", true, "[a TO c]");
+ assertMultitermEquals("[A TO C]", false, "[A TO C]");
+
+
+ // Test suffix queries: first disallow
+ try {
+ assertMultitermEquals("*Term", true, "*term");
+ // fix: fail() was missing here (present in the parallel ?Term block
+ // below), so a leading-* query that parsed without error passed silently
+ fail();
+ } catch(Exception pe) {
+ // expected exception
+ if(!isQueryParserException(pe)){
+ fail();
+ }
+ }
+ try {
+ assertMultitermEquals("?Term", true, "?term");
+ fail();
+ } catch(Exception pe) {
+ // expected exception
+ if(!isQueryParserException(pe)){
+ fail();
+ }
+ }
+ // Test suffix queries: then allow
+ assertMultitermEquals("*Term", true, "*term", true);
+ assertMultitermEquals("?Term", true, "?term", true);
+ }
+
+
+
+
+
+
+
+ /** With leading wildcards enabled, all wildcard forms parse to SpanMultiTermQueryWrapper. */
+ public void testLeadingWildcardType() throws Exception {
+ CommonQueryParserConfiguration cqpC = getParserConfig(null);
+ cqpC.setAllowLeadingWildcard(true);
+ assertEquals(SpanMultiTermQueryWrapper.class, getQuery("t*erm*",cqpC).getClass());
+ assertEquals(SpanMultiTermQueryWrapper.class, getQuery("?term*",cqpC).getClass());
+ assertEquals(SpanMultiTermQueryWrapper.class, getQuery("*term*",cqpC).getClass());
+ }
+
+ /**
+  * Behavior under QPTestAnalyzer: 'stop' tokens are dropped (rendering as
+  * spanOr([]) when alone) and 'phrase' expands to spanOr([phrase1, phrase2]).
+  */
+ public void testQPA() throws Exception {
+ assertQueryEquals("term term^3.0 term", qpAnalyzer, "term term^3.0 term");
+ assertQueryEquals("term stop^3.0 term", qpAnalyzer, "term term");
+
+ assertQueryEquals("term term term", qpAnalyzer, "term term term");
+ assertQueryEquals("term +stop term", qpAnalyzer, "term term");
+ assertQueryEquals("term -stop term", qpAnalyzer, "term term");
+
+ assertQueryEquals("drop AND (stop) AND roll", qpAnalyzer, "+drop +roll");
+ assertQueryEquals("term +(stop) term", qpAnalyzer, "term term");
+ assertQueryEquals("term -(stop) term", qpAnalyzer, "term term");
+
+ assertQueryEquals("drop AND stop AND roll", qpAnalyzer, "+drop +roll");
+ assertQueryEquals("term phrase term", qpAnalyzer,
+ "term spanOr([phrase1, phrase2]) term");
+ assertQueryEquals("term AND NOT phrase term", qpAnalyzer,
+ "+term -spanOr([phrase1, phrase2]) term");
+ assertQueryEquals("stop^3", qpAnalyzer, "spanOr([])");
+ assertQueryEquals("stop", qpAnalyzer, "spanOr([])");
+ assertQueryEquals("(stop)^3", qpAnalyzer, "spanOr([])");
+ assertQueryEquals("((stop))^3", qpAnalyzer, "spanOr([])");
+ assertQueryEquals("(stop^3)", qpAnalyzer, "spanOr([])");
+ assertQueryEquals("((stop)^3)", qpAnalyzer, "spanOr([])");
+ assertQueryEquals("(stop)", qpAnalyzer, "spanOr([])");
+ assertQueryEquals("((stop))", qpAnalyzer, "spanOr([])");
+ assertTrue(getQuery("term term term", qpAnalyzer) instanceof BooleanQuery);
+ assertTrue(getQuery("term +stop", qpAnalyzer) instanceof SpanTermQuery);
+ }
+
+ /**
+  * Range syntax ([..], {..}, mixed bounds) renders as a wrapped term-range;
+  * bare * endpoints are escaped in the rendering.
+  */
+ public void testRange() throws Exception {
+ assertQueryEquals("[ a TO z]", null, "SpanMultiTermQueryWrapper([a TO z])");
+ assertQueryEquals("[ a TO z}", null, "SpanMultiTermQueryWrapper([a TO z})");
+ assertQueryEquals("{ a TO z]", null, "SpanMultiTermQueryWrapper({a TO z])");
+ assertQueryEquals("{ a TO z}", null, "SpanMultiTermQueryWrapper({a TO z})");
+
+ //SQP:not sure what this should be
+ // assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT,
+ // ((SpanMultiTermQueryWrapper)getQuery("[ a TO z]")).getRewriteMethod());
+ //TODO: turn back on
+ /*
+ CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+
+ qp.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+ assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE,((TermRangeQuery)getQuery("[ a TO z]", qp)).getRewriteMethod());
+
+ // test open ranges
+ assertQueryEquals("[ a TO * ]", null, "[a TO *]");
+ assertQueryEquals("[ * TO z ]", null, "[* TO z]");
+ assertQueryEquals("[ * TO * ]", null, "[* TO *]");
+ */
+ // mixing exclude and include bounds
+ assertQueryEquals("{ a TO z ]", null, "SpanMultiTermQueryWrapper({a TO z])");
+ assertQueryEquals("[ a TO z }", null, "SpanMultiTermQueryWrapper([a TO z})");
+ assertQueryEquals("{ a TO * ]", null, "SpanMultiTermQueryWrapper({a TO \\*])");
+ assertQueryEquals("[ * TO z }", null, "SpanMultiTermQueryWrapper([\\* TO z})");
+
+ assertQueryEquals("[ a TO z ]", null, "SpanMultiTermQueryWrapper([a TO z])");
+ assertQueryEquals("{ a TO z}", null, "SpanMultiTermQueryWrapper({a TO z})");
+ assertQueryEquals("{ a TO z }", null, "SpanMultiTermQueryWrapper({a TO z})");
+ assertQueryEquals("{ a TO z }^2.0", null, "SpanMultiTermQueryWrapper({a TO z})");
+ assertBoostEquals("{ a TO z }^2.0", 2.0f);
+ assertQueryEquals("[ a TO z] OR bar", null, "SpanMultiTermQueryWrapper([a TO z]) bar");
+ assertQueryEquals("[ a TO z] AND bar", null, "+SpanMultiTermQueryWrapper([a TO z]) +bar");
+ assertQueryEquals("( bar blar { a TO z}) ", null, "bar blar SpanMultiTermQueryWrapper({a TO z})");
+ assertQueryEquals("gack ( bar blar { a TO z}) ", null, "gack (bar blar SpanMultiTermQueryWrapper({a TO z}))");
+
+ assertQueryEquals("[* TO Z]",null,"SpanMultiTermQueryWrapper([\\* TO z])");
+ assertQueryEquals("[A TO *]",null,"SpanMultiTermQueryWrapper([a TO \\*])");
+ assertQueryEquals("[* TO *]",null,"SpanMultiTermQueryWrapper([\\* TO \\*])");
+ }
+
+ /** Quoted * endpoints stay literal; classic QP behaves differently (see comments). */
+ public void testRangeWithPhrase() throws Exception {
+ //different behavior than classic
+ // assertQueryEquals("[\\* TO \"*\"]",null,"[\\* TO \\*]");
+ // assertQueryEquals("[\"*\" TO *]",null,"[\\* TO *]");
+
+ assertQueryEquals("[\\* TO \"*\"]",null,"SpanMultiTermQueryWrapper([\\* TO \"*\"])");
+ assertQueryEquals("[\"*\" TO *]",null,"SpanMultiTermQueryWrapper([\"*\" TO \\*])");
+
+ }
+
+ /**
+  * Quotes a localized date string when it contains a space so the parser
+  * treats it as a single token; otherwise returns it unchanged.
+  */
+ private String escapeDateString(String s) {
+ return s.contains(" ") ? "\"" + s + "\"" : s;
+ }
+
+ /** for testing DateTools support: parses a localized date string and indexes it at {@code resolution}. */
+ private String getDate(String s, DateTools.Resolution resolution) throws Exception {
+ // we use the default Locale since LuceneTestCase randomizes it
+ DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, Locale.getDefault());
+ return getDate(df.parse(s), resolution);
+ }
+
+ /** for testing DateTools support: renders a Date as an index term at {@code resolution}. */
+ private String getDate(Date d, DateTools.Resolution resolution) {
+ return DateTools.dateToString(d, resolution);
+ }
+
+ /** Formats year/month/day (at 23:59:59.999, end-of-day) in the default short locale format. */
+ private String getLocalizedDate(int year, int month, int day) {
+ // we use the default Locale/TZ since LuceneTestCase randomizes it
+ DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, Locale.getDefault());
+ Calendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.getDefault());
+ calendar.clear();
+ calendar.set(year, month, day);
+ calendar.set(Calendar.HOUR_OF_DAY, 23);
+ calendar.set(Calendar.MINUTE, 59);
+ calendar.set(Calendar.SECOND, 59);
+ calendar.set(Calendar.MILLISECOND, 999);
+ return df.format(calendar.getTime());
+ }
+
+ /**
+  * Verifies per-field date resolutions: 'month' and 'hour' fields use their
+  * field-specific resolution; an unconfigured field falls back to the
+  * default (MILLISECOND).
+  */
+ public void testDateRange() throws Exception {
+ String startDate = getLocalizedDate(2002, 1, 1);
+ String endDate = getLocalizedDate(2002, 1, 4);
+ // we use the default Locale/TZ since LuceneTestCase randomizes it
+ Calendar endDateExpected = new GregorianCalendar(TimeZone.getDefault(), Locale.getDefault());
+ endDateExpected.clear();
+ endDateExpected.set(2002, 1, 4, 23, 59, 59);
+ endDateExpected.set(Calendar.MILLISECOND, 999);
+ final String defaultField = "default";
+ final String monthField = "month";
+ final String hourField = "hour";
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ CommonQueryParserConfiguration qp = getParserConfig(a);
+
+ // set a field specific date resolution
+ setDateResolution(qp, monthField, DateTools.Resolution.MONTH);
+
+ // set default date resolution to MILLISECOND
+ qp.setDateResolution(DateTools.Resolution.MILLISECOND);
+
+ // set second field specific date resolution
+ setDateResolution(qp, hourField, DateTools.Resolution.HOUR);
+
+
+ // for this field no field specific date resolution has been set,
+ // so verify if the default resolution is used
+ assertDateRangeQueryEquals(qp, defaultField, startDate, endDate,
+ endDateExpected.getTime(), DateTools.Resolution.MILLISECOND);
+
+ // verify if field specific date resolutions are used for these two fields
+ assertDateRangeQueryEquals(qp, monthField, startDate, endDate,
+ endDateExpected.getTime(), DateTools.Resolution.MONTH);
+
+ assertDateRangeQueryEquals(qp, hourField, startDate, endDate,
+ endDateExpected.getTime(), DateTools.Resolution.HOUR);
+ }
+
+ /**
+  * Checks both inclusive [..] and exclusive {..} date ranges: the inclusive
+  * upper bound uses the end-of-day Date, the exclusive one the raw end date.
+  */
+ public void assertDateRangeQueryEquals(CommonQueryParserConfiguration cqpC, String field, String startDate, String endDate,
+ Date endDateInclusive, DateTools.Resolution resolution) throws Exception {
+
+ assertQueryEquals(cqpC, field, field + ":[" + escapeDateString(startDate) + " TO " + escapeDateString(endDate) + "]",
+ "SpanMultiTermQueryWrapper([" + getDate(startDate, resolution) + " TO " + getDate(endDateInclusive, resolution) + "])");
+
+ assertQueryEquals(cqpC, field, field + ":{" + escapeDateString(startDate) + " TO " + escapeDateString(endDate) + "}",
+ "SpanMultiTermQueryWrapper({" + getDate(startDate, resolution) + " TO " + getDate(endDate, resolution) + "})");
+ }
+
+
+
+
+ /**
+  * Escape handling: backslash-escaped specials become literal characters,
+  * \\uXXXX unicode escapes are decoded, and dangling/invalid escapes raise
+  * ParseException. Divergences from the classic parser are noted inline.
+  */
+ @Test
+ public void testEscaped() throws Exception {
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+ //commented out in QueryParserTestBase
+ /* assertQueryEquals("\\[brackets", a, "\\[brackets");
+ assertQueryEquals("\\[brackets", null, "brackets");
+ assertQueryEquals("\\\\", a, "\\\\");
+ assertQueryEquals("\\+blah", a, "\\+blah");
+ assertQueryEquals("\\(blah", a, "\\(blah");
+
+ assertQueryEquals("\\-blah", a, "\\-blah");
+ assertQueryEquals("\\!blah", a, "\\!blah");
+ assertQueryEquals("\\{blah", a, "\\{blah");
+ assertQueryEquals("\\}blah", a, "\\}blah");
+ assertQueryEquals("\\:blah", a, "\\:blah");
+ assertQueryEquals("\\^blah", a, "\\^blah");
+ assertQueryEquals("\\[blah", a, "\\[blah");
+ assertQueryEquals("\\]blah", a, "\\]blah");
+ assertQueryEquals("\\\"blah", a, "\\\"blah");
+ assertQueryEquals("\\(blah", a, "\\(blah");
+ assertQueryEquals("\\)blah", a, "\\)blah");
+ assertQueryEquals("\\~blah", a, "\\~blah");
+ assertQueryEquals("\\*blah", a, "\\*blah");
+ assertQueryEquals("\\?blah", a, "\\?blah");*/
+ //assertQueryEquals("foo \\&\\& bar", a, "foo \\&\\& bar");
+ //assertQueryEquals("foo \\|| bar", a, "foo \\|| bar");
+ //assertQueryEquals("foo \\AND bar", a, "foo \\AND bar");
+
+ assertQueryEquals("\\a", a, "a");
+
+ // escaped specials inside terms become literal characters
+ assertQueryEquals("a\\-b:c", a, "a-b:c");
+ assertQueryEquals("a\\+b:c", a, "a+b:c");
+ assertQueryEquals("a\\:b:c", a, "a:b:c");
+ assertQueryEquals("a\\\\b:c", a, "a\\b:c");
+
+ assertQueryEquals("a:b\\-c", a, "a:b-c");
+ assertQueryEquals("a:b\\+c", a, "a:b+c");
+ assertQueryEquals("a:b\\:c", a, "a:b:c");
+ assertQueryEquals("a:b\\\\c", a, "a:b\\c");
+
+ // escaped chars inside wildcard/fuzzy terms
+ assertMultitermEquals("a", "a:b\\-c*", a, "b-c*");
+ assertMultitermEquals("a", "a:b\\+c*", a, "b+c*");
+ assertMultitermEquals("a", "a:b\\:c*", a, "b:c*");
+
+ assertMultitermEquals("a", "a:b\\\\c*", a, "b\\c*");
+
+ assertMultitermEquals("a", "a:b\\-c~", a, "b-c~2.0");
+ assertMultitermEquals("a", "a:b\\+c~", a, "b+c~2.0");
+ assertMultitermEquals("a", "a:b\\:c~", a, "b:c~2.0");
+ assertMultitermEquals("a", "a:b\\\\c~", a, "b\\c~2.0");
+
+ assertMultitermEquals("[ a\\- TO a\\+ ]", "[a- TO a+]");
+ assertMultitermEquals("[ a\\: TO a\\~ ]", "[a: TO a~]");
+ assertMultitermEquals("[ a\\\\ TO a\\* ]", "[a\\ TO a*]");
+
+ assertMultitermEquals("[\"c\\:\\\\temp\\\\\\~foo0.txt\" TO \"c\\:\\\\temp\\\\\\~foo9.txt\"]", a,
+ "[\"c:\\temp\\~foo0.txt\" TO \"c:\\temp\\~foo9.txt\"]");
+ //different behavior than classic: doesn't trim leading and trailing quotes
+ // "[c:\\temp\\~foo0.txt TO c:\\temp\\~foo9.txt]");
+
+ assertQueryEquals("a\\\\\\+b", a, "a\\+b");
+
+ assertQueryEquals("a \\\"b c\\\" d", a, "a \"b c\" d");
+ assertQueryEquals("\"a \\\"b c\\\" d\"", a, "spanNear([a, \"b, c\", d], 0, true)");
+ assertQueryEquals("\"a \\+b c d\"", a, "spanNear([a, +b, c, d], 0, true)");
+
+ assertQueryEquals("c\\:\\\\temp\\\\\\~foo.txt", a, "c:\\temp\\~foo.txt");
+
+ assertParseException("XY\\"); // there must be a character after the escape char
+
+ // test unicode escaping
+ assertQueryEquals("a\\u0062c", a, "abc");
+ assertQueryEquals("XY\\u005a", a, "XYZ");
+ assertQueryEquals("XY\\u005A", a, "XYZ");
+ assertQueryEquals("\"a \\\\\\u0028\\u0062\\\" c\"", a, "spanNear([a, \\(b\", c], 0, true)");
+
+ assertParseException("XY\\u005G"); // test non-hex character in escaped unicode sequence
+ assertParseException("XY\\u005"); // test incomplete escaped unicode sequence
+
+ // Tests bug LUCENE-800
+ assertQueryEquals("(item:\\\\ item:ABCD\\\\)", a, "item:\\ item:ABCD\\");
+ assertParseException("(item:\\\\ item:ABCD\\\\))"); // unmatched closing paranthesis
+ assertQueryEquals("\\*", a, "*");
+ assertQueryEquals("\\\\", a, "\\"); // escaped backslash
+
+ assertParseException("\\"); // a backslash must always be escaped
+
+ // LUCENE-1189
+ assertQueryEquals("(\"a\\\\\") or (\"b\")", a ,"a\\ or b");
+
+ //fails actual LUCENE-1189 test, but so does classic query parser
+ //assertQueryEquals("(name:\"///mike\\\\\\\") or (name:\"alphonse\")", a,
+ // "name:///mike\\\\\\ or alphonse");
+ }
+
+
+ // Verifies the interaction of an escaped operator with an unescaped '?' in the
+ // same term: the escaped character stays literal while '?' is still treated as
+ // a single-character wildcard, producing a SpanMultiTermQueryWrapper.
+ public void testEscapedVsQuestionMarkAsWildcard() throws Exception {
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+ //SpanMultiTermQueryWrapper(a:b-?c)
+ assertMultitermEquals("a", "a:b\\-?c", a, "b\\-?c");
+ assertMultitermEquals("a", "a:b\\+?c", a, "b\\+?c");
+ assertMultitermEquals("a", "a:b\\:?c", a, "b\\:?c");
+
+ assertMultitermEquals("a", "a:b\\\\?c", a, "b\\\\?c");
+ }
+
+ // Tests QueryParserBase-style escaping of a raw query string: each unescaped
+ // input (first argument) must round-trip to the escaped form (last argument).
+ // The null-analyzer cases exercise escaping without any analysis step.
+ public void testQueryStringEscaping() throws Exception {
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+
+ assertEscapedQueryEquals("a-b:c", a, "a\\-b\\:c");
+ assertEscapedQueryEquals("a+b:c", a, "a\\+b\\:c");
+ assertEscapedQueryEquals("a:b:c", a, "a\\:b\\:c");
+ assertEscapedQueryEquals("a\\b:c", a, "a\\\\b\\:c");
+
+ assertEscapedQueryEquals("a:b-c", a, "a\\:b\\-c");
+ assertEscapedQueryEquals("a:b+c", a, "a\\:b\\+c");
+ assertEscapedQueryEquals("a:b:c", a, "a\\:b\\:c");
+ assertEscapedQueryEquals("a:b\\c", a, "a\\:b\\\\c");
+
+ assertEscapedQueryEquals("a:b-c*", a, "a\\:b\\-c\\*");
+ assertEscapedQueryEquals("a:b+c*", a, "a\\:b\\+c\\*");
+ assertEscapedQueryEquals("a:b:c*", a, "a\\:b\\:c\\*");
+
+ assertEscapedQueryEquals("a:b\\\\c*", a, "a\\:b\\\\\\\\c\\*");
+
+ assertEscapedQueryEquals("a:b-?c", a, "a\\:b\\-\\?c");
+ assertEscapedQueryEquals("a:b+?c", a, "a\\:b\\+\\?c");
+ assertEscapedQueryEquals("a:b:?c", a, "a\\:b\\:\\?c");
+
+ assertEscapedQueryEquals("a:b?c", a, "a\\:b\\?c");
+
+ assertEscapedQueryEquals("a:b-c~", a, "a\\:b\\-c\\~");
+ assertEscapedQueryEquals("a:b+c~", a, "a\\:b\\+c\\~");
+ assertEscapedQueryEquals("a:b:c~", a, "a\\:b\\:c\\~");
+ assertEscapedQueryEquals("a:b\\c~", a, "a\\:b\\\\c\\~");
+
+ assertEscapedQueryEquals("[ a - TO a+ ]", null, "\\[ a \\- TO a\\+ \\]");
+ assertEscapedQueryEquals("[ a : TO a~ ]", null, "\\[ a \\: TO a\\~ \\]");
+ assertEscapedQueryEquals("[ a\\ TO a* ]", null, "\\[ a\\\\ TO a\\* \\]");
+
+ // LUCENE-881
+ assertEscapedQueryEquals("|| abc ||", a, "\\|\\| abc \\|\\|");
+ assertEscapedQueryEquals("&& abc &&", a, "\\&\\& abc \\&\\&");
+ }
+
+ // Verifies that tab, newline, carriage return, and CR/LF sequences between
+ // clauses are all treated as plain whitespace: every variant must parse to the
+ // same default-operator-AND query "+weltbank +worlbank".
+ public void testTabNewlineCarriageReturn()
+ throws Exception {
+ assertQueryEqualsDOA("+weltbank +worlbank", null,
+ "+weltbank +worlbank");
+
+ assertQueryEqualsDOA("+weltbank\n+worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \n+worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \n +worlbank", null,
+ "+weltbank +worlbank");
+
+ assertQueryEqualsDOA("+weltbank\r+worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \r+worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \r +worlbank", null,
+ "+weltbank +worlbank");
+
+ assertQueryEqualsDOA("+weltbank\r\n+worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \r\n+worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \r\n +worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \r \n +worlbank", null,
+ "+weltbank +worlbank");
+
+ assertQueryEqualsDOA("+weltbank\t+worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \t+worlbank", null,
+ "+weltbank +worlbank");
+ assertQueryEqualsDOA("weltbank \t +worlbank", null,
+ "+weltbank +worlbank");
+ }
+
+ // Basic default-operator-AND behavior: bare terms become required (+term),
+ // explicit +/- prefixes are preserved.
+ // NOTE(review): the name says "DAO" but the helpers say "DOA" (default
+ // operator AND) — presumably the same acronym with letters transposed;
+ // consider unifying the spelling.
+ public void testSimpleDAO()
+ throws Exception {
+ assertQueryEqualsDOA("term term term", null, "+term +term +term");
+ assertQueryEqualsDOA("term +term term", null, "+term +term +term");
+ assertQueryEqualsDOA("term term +term", null, "+term +term +term");
+ assertQueryEqualsDOA("term +term +term", null, "+term +term +term");
+ assertQueryEqualsDOA("-term term term", null, "-term +term +term");
+ }
+
+ public void testBoost()
+ throws Exception {
+ CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
+ Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
+ CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
+ Query q = getQuery("on^1.0",qp);
+ assertNotNull(q);
+ q = getQuery("\"hello\"^2.0",qp);
+ assertNotNull(q);
+ assertEquals(q.getBoost(), (float) 2.0, (float) 0.01);
+ q = getQuery("hello^2.0",qp);
+ assertNotNull(q);
+ assertEquals(q.getBoost(), (float) 2.0, (float) 0.01);
+ q = getQuery("\"on\"^1.0",qp);
+ assertNotNull(q);
+
+ Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
+ CommonQueryParserConfiguration qp2 = getParserConfig(a2);
+ q = getQuery("the^3", qp2);
+ // "the" is a stop word so the result is an empty query:
+ assertNotNull(q);
+ assertEquals("spanOr([])", q.toString());
+ assertEquals(1.0f, q.getBoost(), 0.01f);
+ }
+
+ /**
+ * Asserts that parsing the given query string throws a query-parser exception.
+ * Any other exception type falls through the isQueryParserException check and
+ * still reaches fail(), as does a successful parse.
+ */
+ public void assertParseException(String queryString) throws Exception {
+ try {
+ getQuery(queryString);
+ } catch (Exception expected) {
+ if(isQueryParserException(expected)){
+ return;
+ }
+ }
+ fail("ParseException expected, not thrown");
+ }
+
+ /**
+ * Same as {@link #assertParseException(String)} but parses with the supplied
+ * analyzer. Fails when the parse succeeds or throws a non-parser exception.
+ */
+ public void assertParseException(String queryString, Analyzer a) throws Exception {
+ try {
+ getQuery(queryString, a);
+ } catch (Exception expected) {
+ if(isQueryParserException(expected)){
+ return;
+ }
+ }
+ fail("ParseException expected, not thrown");
+ }
+
+ // Malformed queries (unterminated phrase, unbalanced parens, illegal colons,
+ // doubled boosts) must raise a parser exception rather than parse silently.
+ public void testException() throws Exception {
+ assertParseException("\"some phrase");
+ assertParseException("(foo bar");
+ assertParseException("foo bar))");
+ assertParseException("field:term:with:colon some more terms");
+ assertParseException("(sub query)^5.0^2.0 plus more");
+ assertParseException("secret AND illegal) AND access:confidential");
+ }
+
+ // Exceeding BooleanQuery's max clause count must surface as a parse exception.
+ // The global limit is lowered here and restored by tearDown() via
+ // originalMaxClauses, so this test must not be reordered to skip tearDown.
+ public void testBooleanQuery() throws Exception {
+ BooleanQuery.setMaxClauseCount(2);
+ Analyzer purWhitespaceAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+ assertParseException("one two three", purWhitespaceAnalyzer);
+ }
+
+ /**
+ * This test differs from TestPrecedenceQueryParser: with no operator
+ * precedence, "A AND B OR C AND D" parses to the same query as "+A +B +C +D".
+ */
+ public void testPrecedence() throws Exception {
+ CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+ Query query1 = getQuery("A AND B OR C AND D", qp);
+ Query query2 = getQuery("+A +B +C +D", qp);
+ assertEquals(query1, query2);
+ }
+
+ // Todo: convert this from DateField to DateUtil
+ // public void testLocalDateFormat() throws IOException, ParseException {
+ // Directory ramDir = newDirectory();
+ // IndexWriter iw = new IndexWriter(ramDir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
+ // addDateDoc("a", 2005, 12, 2, 10, 15, 33, iw);
+ // addDateDoc("b", 2005, 12, 4, 22, 15, 00, iw);
+ // iw.close();
+ // IndexSearcher is = new IndexSearcher(ramDir, true);
+ // assertHits(1, "[12/1/2005 TO 12/3/2005]", is);
+ // assertHits(2, "[12/1/2005 TO 12/4/2005]", is);
+ // assertHits(1, "[12/3/2005 TO 12/4/2005]", is);
+ // assertHits(1, "{12/1/2005 TO 12/3/2005}", is);
+ // assertHits(1, "{12/1/2005 TO 12/4/2005}", is);
+ // assertHits(0, "{12/3/2005 TO 12/4/2005}", is);
+ // is.close();
+ // ramDir.close();
+ // }
+ //
+ // private void addDateDoc(String content, int year, int month,
+ // int day, int hour, int minute, int second, IndexWriter iw) throws IOException {
+ // Document d = new Document();
+ // d.add(newField(FIELD, content, Field.Store.YES, Field.Index.ANALYZED));
+ // Calendar cal = Calendar.getInstance(Locale.ENGLISH);
+ // cal.set(year, month - 1, day, hour, minute, second);
+ // d.add(newField("date", DateField.dateToString(cal.getTime()), Field.Store.YES, Field.Index.NOT_ANALYZED));
+ // iw.addDocument(d);
+ // }
+
+ // public abstract void testStarParsing() throws Exception;
+
+ // An escaped '?' stays literal while the unescaped '?' remains a wildcard;
+ // the parser must produce the span-wrapped WildcardQuery built by hand here.
+ // (Raw SpanMultiTermQueryWrapper matches the style used elsewhere in this file.)
+ public void testEscapedWildcard() throws Exception {
+ CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+ WildcardQuery q = new WildcardQuery(new Term(FIELD, "foo\\?ba?r"));
+ SpanMultiTermQueryWrapper wq = new SpanMultiTermQueryWrapper(q);
+ assertEquals(wq, getQuery("foo\\?ba?r", qp));
+ }
+
+ // Regex term parsing (/.../): wrapping in SpanMultiTermQueryWrapper,
+ // lowercasing of expanded terms, boost handling, rewrite-method propagation,
+ // escaped slashes inside the pattern, and regexes combined in boolean queries.
+ public void testRegexps() throws Exception {
+ CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+ RegexpQuery q = new RegexpQuery(new Term(FIELD, "[a-z][123]"));
+ assertEqualsWrappedRegexp(q, getQuery("/[a-z][123]/",qp));
+ qp.setLowercaseExpandedTerms(true);
+ assertEqualsWrappedRegexp(q, getQuery("/[A-Z][123]/",qp));
+ q.setBoost(0.5f);
+ //assertEqualsWrappedRegexp(q, getQuery("/[A-Z][123]/^0.5",qp));
+ assertBoostEquals("/[A-Z][123]/^0.5", 0.5f);
+ qp.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+ q.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+ assertTrue(getQuery("/[A-Z][123]/^0.5",qp) instanceof SpanMultiTermQueryWrapper);
+ // assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE,
+ // ((SpanMultiTermQueryWrapper)getQuery("/[A-Z][123]/^0.5",qp)).getRewriteMethod());
+ // assertEqualsWrappedRegexp(q, getQuery("/[A-Z][123]/^0.5",qp));
+ assertBoostEquals("/[A-Z][123]/^0.5", 0.5f);
+
+ qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
+
+ SpanMultiTermQueryWrapper escaped =
+ //SQP changed [a-z]\\/[123] to [a-z]/[123]
+ new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "[a-z]/[123]")));
+
+ assertEquals(escaped, getQuery("/[a-z]\\/[123]/",qp));
+ SpanMultiTermQueryWrapper escaped2 =
+ new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "[a-z]\\*[123]")));
+ assertEquals(escaped2, getQuery("/[a-z]\\*[123]/",qp));
+
+ // a regex clause, a literal path term, and an escaped path term combined
+ BooleanQuery complex = new BooleanQuery();
+ complex.add(new SpanMultiTermQueryWrapper(
+ new RegexpQuery(new Term(FIELD, "[a-z]/[123]"))), Occur.MUST);
+ complex.add(new SpanTermQuery(new Term("path", "/etc/init.d/")), Occur.MUST);
+ complex.add(new SpanTermQuery(new Term(FIELD, "/etc/init[.]d/lucene/")), Occur.SHOULD);
+ // assertEquals(complex, getQuery("/[a-z]\\/[123]/ AND path:\"/etc/init.d/\" OR \"/etc\\/init\\[.\\]d/lucene/\" ",qp));
+ assertEquals(complex, getQuery("/[a-z]\\/[123]/ AND path:\\/etc\\/init.d\\/ OR \\/etc\\/init\\[.\\]d/lucene\\/ ",qp));
+
+ Query re = new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "http.*")));
+ assertEquals(re, getQuery(FIELD+":/http.*/",qp));
+ assertEquals(re, getQuery("/http.*/",qp));
+
+ // '~' inside /.../ is part of the regex, not a fuzzy operator
+ re = new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "http~0.5")));
+ assertEquals(re, getQuery(FIELD+":/http~0.5/",qp));
+ assertEquals(re, getQuery("/http~0.5/",qp));
+
+ re = new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "boo")));
+ assertEquals(re, getQuery(FIELD+":/boo/",qp));
+ assertEquals(re, getQuery("/boo/",qp));
+
+ // escaped slashes yield a plain term, not a regex
+ // assertEquals(new SpanTermQuery(new Term(FIELD, "/boo/")), getQuery("\"/boo/\"",qp));
+ assertEquals(new SpanTermQuery(new Term(FIELD, "/boo/")), getQuery("\\/boo\\/",qp));
+
+ BooleanQuery two = new BooleanQuery();
+ two.add(new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "foo"))), Occur.SHOULD);
+ two.add(new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "bar"))), Occur.SHOULD);
+ assertEquals(two, getQuery(FIELD+":/foo/ "+FIELD+":/bar/",qp));
+ assertEquals(two, getQuery("/foo/ /bar/",qp));
+ }
+
+
+ // Stop words ("the", "foo") are removed during analysis: an all-stop-word
+ // query collapses to an empty SpanOrQuery, a mixed query drops down to the
+ // single surviving term, and clause counts reflect only non-stopped clauses.
+ public void testStopwords() throws Exception {
+ CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
+ CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
+ Query result = getQuery("field:the OR field:foo",qp);
+ assertNotNull("result is null and it shouldn't be", result);
+ assertTrue("result is not a BooleanQuery", result instanceof SpanOrQuery);
+ assertEquals(0, ((SpanOrQuery)result).getClauses().length);
+ result = getQuery("field:woo OR field:the",qp);
+ assertNotNull("result is null and it shouldn't be", result);
+ assertTrue("result is not a TermQuery", result instanceof SpanTermQuery);
+ result = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)",qp);
+ assertNotNull("result is null and it shouldn't be", result);
+ assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
+ if (VERBOSE) System.out.println("Result: " + result);
+ assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 2, ((BooleanQuery) result).clauses().size() == 2);
+ }
+
+ // With position increments enabled, stop words removed from a phrase leave
+ // gaps but the surviving terms still form the SpanNearQuery; the clause count
+ // and the extracted term set must match the non-stopped words only.
+ public void testPositionIncrement() throws Exception {
+ CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
+ qp.setEnablePositionIncrements(true);
+ String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
+ // 0 2 5 7 8
+ SpanNearQuery pq = (SpanNearQuery) getQuery(qtxt,qp);
+ //System.out.println("Query text: "+qtxt);
+ //System.out.println("Result: "+pq);
+ SpanQuery[] clauses = pq.getClauses();
+ assertEquals(clauses.length, 5);
+ Set expected = new HashSet();
+ expected.add(new Term(FIELD, "words"));
+ expected.add(new Term(FIELD, "poisitions"));
+ expected.add(new Term(FIELD, "pos"));
+ expected.add(new Term(FIELD, "stopped"));
+ expected.add(new Term(FIELD, "phrasequery"));
+
+ Set terms = new HashSet();
+ for (int i = 0; i < clauses.length; i++){
+ SpanQuery q = clauses[i];
+ q.extractTerms(terms);
+ }
+ assertEquals(expected, terms);
+ }
+
+ // "*:*" (bare or parenthesized) parses to MatchAllDocsQuery, and +/-"*:*"
+ // clauses in a boolean query each wrap a MatchAllDocsQuery.
+ public void testMatchAllDocs() throws Exception {
+ CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+ assertEquals(new MatchAllDocsQuery(), getQuery("*:*",qp));
+ assertEquals(new MatchAllDocsQuery(), getQuery("(*:*)",qp));
+ BooleanQuery bq = (BooleanQuery)getQuery("+*:* -*:*",qp);
+ assertTrue(bq.getClauses()[0].getQuery() instanceof MatchAllDocsQuery);
+ assertTrue(bq.getClauses()[1].getQuery() instanceof MatchAllDocsQuery);
+ }
+
+
+ @Override
+ public void tearDown() throws Exception {
+ // restore the global clause limit mutated by testBooleanQuery
+ BooleanQuery.setMaxClauseCount(originalMaxClauses);
+ super.tearDown();
+ }
+
+ /**
+ * Parses the query with the default operator set to AND ("DOA").
+ * A null analyzer falls back to a lowercasing MockAnalyzer.
+ */
+ public Query getQueryDOA(String query, Analyzer a)
+ throws Exception {
+ if (a == null)
+ a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ CommonQueryParserConfiguration qp = getParserConfig(a);
+ setDefaultOperatorAND(qp);
+ return getQuery(query, qp);
+ }
+
+ // Narrowing cast: this suite only ever builds SpanQueryParser configs, which
+ // is what exposes setDefaultOperator.
+ private void setDefaultOperatorAND(CommonQueryParserConfiguration qp) {
+ ((SpanQueryParser)qp).setDefaultOperator(Operator.AND);
+ }
+
+
+ /**
+ * Parses the query with default operator AND and asserts that its
+ * toString(FIELD) rendering equals the expected result.
+ */
+ public void assertQueryEqualsDOA(String query, Analyzer a, String result)
+ throws Exception {
+ Query q = getQueryDOA(query, a);
+ String s = q.toString(FIELD);
+ if (!s.equals(result)) {
+ fail("Query /" + query + "/ yielded /" + s
+ + "/, expecting /" + result + "/");
+ }
+ }
+}
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestAdvancedAnalyzers.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestAdvancedAnalyzers.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestAdvancedAnalyzers.java (revision 0)
@@ -0,0 +1,568 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queryparser.spans;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.queryparser.spans.AnalyzingQueryParserBase.NORM_MULTI_TERMS;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TotalHitCountCollector;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util._TestUtil;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestAdvancedAnalyzers extends LuceneTestCase {
+
+ private static IndexReader reader;
+ private static IndexSearcher searcher;
+ private static Directory directory;
+ private static Analyzer synAnalyzer;
+ private static Analyzer baseAnalyzer;
+ private static Analyzer ucVowelAnalyzer;
+ private static final String FIELD1 = "f1";
+ private static final String FIELD2 = "f2";
+ private static final String FIELD3 = "f3";
+ private static final String FIELD4 = "f4";
+
+
+ // private static final CharacterRunAutomaton STOP_WORDS = new CharacterRunAutomaton(
+ // BasicOperations.union(Arrays.asList(makeString("a"), makeString("an"))));
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+
+ // synonym-expanding analyzer: SIMPLE tokenizer -> non-whitespace splitter -> mock synonyms
+ synAnalyzer = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE,
+ true);
+ TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
+
+ filter = new MockSynFilter(filter);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ // same chain without synonyms; used to build the index
+ baseAnalyzer = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE,
+ true);
+ TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ // uppercases vowels, so terms analyzed with it only match when the query
+ // side uses the same analyzer
+ ucVowelAnalyzer = new Analyzer(){
+ @Override
+ public TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE,
+ true);
+ TokenFilter filter = new MockUCVowelFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+ // local duplicate used only while indexing FIELD4, so the static instance
+ // is not shared with the token streams held by the writer
+ Analyzer tmpUCVowelAnalyzer = new Analyzer(){
+ @Override
+ public TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE,
+ true);
+ TokenFilter filter = new MockUCVowelFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+ directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, baseAnalyzer)
+ .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+ String[] docs = new String[] {
+ "abc_def",
+ "lmnop",
+ "abc",
+ "qrs tuv",
+ "qrs_tuv"
+ };
+ // FIELD1/FIELD3 are indexed with baseAnalyzer (writer default);
+ // FIELD2/FIELD4 carry pre-analyzed token streams from the ucVowel analyzers
+ for (int i = 0; i < docs.length; i++) {
+ Document doc = new Document();
+ doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
+ TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES);
+ tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i]));
+ doc.add(tf);
+ doc.add(newTextField(FIELD3, docs[i], Field.Store.YES));
+
+ TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES);
+ tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i]));
+ doc.add(tf4);
+ writer.addDocument(doc);
+ }
+ reader = writer.getReader();
+ searcher = new IndexSearcher(reader);
+ writer.close();
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+ reader.close();
+ directory.close();
+ // null ALL static references (not just reader/directory) so
+ // LuceneTestCase's static-field leak checker does not flag this suite;
+ // searcher and ucVowelAnalyzer were previously left populated
+ reader = null;
+ directory = null;
+ searcher = null;
+ synAnalyzer = null;
+ baseAnalyzer = null;
+ ucVowelAnalyzer = null;
+ }
+
+ @Test
+ public void testSynBasic() throws Exception {
+
+ // query-side synonym expansion: "abc" expands (via MockSynFilter) and so
+ // matches more spans/docs than the literal term alone
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, synAnalyzer);
+ countSpansDocs(p, "tuv", 2, 2);
+
+ countSpansDocs(p, "abc", 6, 4);
+
+ }
+
+ @Test
+ public void testNonWhiteSpace() throws Exception {
+ // a single bracketed "term" containing '_' is split by the analyzer into
+ // multiple tokens, producing a SpanNearQuery; outer slop/boost apply to it
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT,FIELD1, baseAnalyzer);
+ String s = "[zqx_qrs^3.0]~3^2";
+ Query q = p.parse(s);
+ assertTrue(q instanceof SpanNearQuery);
+
+ SpanNearQuery near = (SpanNearQuery)q;
+ SpanQuery[] clauses = near.getClauses();
+ assertEquals(2, clauses.length);
+
+ assertEquals(3, near.getSlop());
+ assertTrue(clauses[0] instanceof SpanTermQuery);
+ assertTrue(clauses[1] instanceof SpanTermQuery);
+
+ assertEquals("zqx", ((SpanTermQuery)clauses[0]).getTerm().text());
+ assertEquals("qrs", ((SpanTermQuery)clauses[1]).getTerm().text());
+
+ //take the boost from the phrase, ignore boost on term
+ //not necessarily right choice, but this is how it works now
+ assertEquals(2.0f, q.getBoost(), 0.00001f);
+
+ // with auto-generated phrase queries, the split token becomes a nested
+ // in-order, zero-slop SpanNearQuery inside the outer near
+ s = "[zqx2_qrs3 lmnop]~3";
+ p.setAutoGeneratePhraseQueries(true);
+ q = p.parse(s);
+ assertTrue(q instanceof SpanQuery);
+ assertTrue(q instanceof SpanNearQuery);
+ near = (SpanNearQuery)q;
+ clauses = near.getClauses();
+ assertEquals(2, clauses.length);
+
+ assertEquals(3, near.getSlop());
+ assertTrue(clauses[0] instanceof SpanNearQuery);
+ assertTrue(clauses[1] instanceof SpanTermQuery);
+
+ SpanNearQuery child = (SpanNearQuery)clauses[0];
+ SpanQuery[] childClauses = child.getClauses();
+ assertEquals(2, childClauses.length);
+
+ assertEquals("zqx", ((SpanTermQuery)childClauses[0]).getTerm().text());
+ assertEquals("qrs", ((SpanTermQuery)childClauses[1]).getTerm().text());
+
+ assertTrue(child.isInOrder());
+ assertEquals(child.getSlop(), 0);
+ }
+
+ //test different initializations/settings with multifield analyzers
+ @Test
+ public void testAnalyzerCombos() throws Exception{
+ //wt = whole term
+ Map wt = new HashMap();
+ Map mt = new HashMap();
+
+
+ //basic, correct set up
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer);
+ assertEquals(1, countDocs((SpanQuery)p.parse("lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("lm*op")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("LM*OP")));
+ assertEquals(NORM_MULTI_TERMS.LOWERCASE, p.getNormMultiTerms());
+
+
+
+ //basic, correct set up
+ p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD2, ucVowelAnalyzer);
+ assertEquals(NORM_MULTI_TERMS.LOWERCASE, p.getNormMultiTerms());
+ assertEquals(1, countDocs((SpanQuery)p.parse("lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse("LM*OP")));
+
+ //set to lowercase only, won't analyze
+ assertEquals(0, countDocs((SpanQuery)p.parse("lm*op")));
+ p.setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE);
+ assertEquals(1, countDocs((SpanQuery)p.parse("lm*op")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("LM*OP")));
+
+ //try sister field, to prove that default analyzer is ucVowelAnalyzer for
+ //unspecified fieldsd
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD4+":lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD4+":lm*op")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD4+":LMNOP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD4+":LM*OP")));
+
+ //try mismatching sister field
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD3+":lmnop")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD3+":lm*op")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD3+":LMNOP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD3+":LM*OP")));
+
+ //advanced, correct set up (for wt but not for mt)
+ p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer);
+ assertEquals(p.getNormMultiTerms(), NORM_MULTI_TERMS.LOWERCASE);
+ wt.clear(); mt.clear();
+ wt.put(FIELD2, ucVowelAnalyzer);
+ p.setAnalyzers(wt, mt);
+ assertEquals(NORM_MULTI_TERMS.ANALYZE, p.getNormMultiTerms());
+ assertEquals(1, countDocs((SpanQuery)p.parse("lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lm*op")));
+
+ //advanced, correct set up for both
+ wt.clear(); mt.clear();
+ wt.put(FIELD2, ucVowelAnalyzer);
+ mt.put(FIELD2, ucVowelAnalyzer);
+ assertEquals(NORM_MULTI_TERMS.ANALYZE, p.getNormMultiTerms());
+ assertEquals(1, countDocs((SpanQuery)p.parse("lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lm*op")));
+
+ p.setNormMultiTerms(NORM_MULTI_TERMS.NONE);
+ assertEquals(NORM_MULTI_TERMS.NONE, p.getNormMultiTerms());
+ assertEquals(1, countDocs((SpanQuery)p.parse("lmnop")));
+ //analyzer still used on whole terms; don't forget!
+ assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse("LM*OP")));
+
+ p.setNormMultiTerms(NORM_MULTI_TERMS.LOWERCASE);
+ assertEquals(NORM_MULTI_TERMS.LOWERCASE, p.getNormMultiTerms());
+ assertEquals(1, countDocs((SpanQuery)p.parse("lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse("LM*OP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP")));
+
+ //mismatch between default field and default analyzer; should return 0
+ p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, ucVowelAnalyzer);
+ assertEquals(0, countDocs((SpanQuery)p.parse("lmnop")));
+ assertEquals(0, countDocs((SpanQuery)p.parse("LMNOP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse("lmnOp")));
+
+ p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer, ucVowelAnalyzer);
+ //cstr with two analyzers sets normMultiTerms = NORM_MULTI_TERM.ANALYZE
+ //can't find any in field1 because these trigger multiTerm analysis
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":lm*op")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":lmno*")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":lmmop~1")));
+
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":LM*OP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":LMNO*")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":LMMOP~1")));
+
+ //can find these in field2 because of multiterm analysis
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lm*op")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmno*")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1")));
+
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1")));
+
+ //try basic use case
+ p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer);
+ //can't find these in field2 because multiterm analysis is using baseAnalyzer
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lm*op")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmno*")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1")));
+
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1")));
+
+ p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer);
+ p.setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE);
+ wt.clear();
+ wt.put(FIELD2, ucVowelAnalyzer);
+ mt.clear();
+ mt.put(FIELD2, ucVowelAnalyzer);
+ p.setAnalyzers(wt, mt);
+
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lm*op")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmno*")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1")));
+
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1")));
+
+
+ //now try adding the wrong analyzer for the whole term, but the
+ //right multiterm analyzer
+ wt.put(FIELD2, baseAnalyzer);
+ p.setAnalyzers(wt, mt);
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmnop")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lm*op")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmno*")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1")));
+
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*")));
+ assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1")));
+
+ //now set them completely improperly
+ mt.put(FIELD2, baseAnalyzer);
+ p.setAnalyzers(wt, mt);
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmnop")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lm*op")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmno*")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1")));
+
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*")));
+ assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1")));
+
+ }
+
+ /** Parses s with p and asserts both the expected span count and the expected doc count. */
+ private void countSpansDocs(SpanQueryParser p, String s, int spanCount,
+ int docCount) throws Exception {
+ SpanQuery q = (SpanQuery)p.parse(s);
+ assertEquals("spanCount: " + s, spanCount, countSpans(q));
+ assertEquals("docCount: " + s, docCount, countDocs(q));
+
+ }
+
+ /**
+  * Counts the total number of span matches for q against the test index.
+  * Assumes the index has exactly one leaf (single segment).
+  */
+ private long countSpans(SpanQuery q) throws Exception {
+ List<AtomicReaderContext> ctxs = reader.leaves();
+ assert (ctxs.size() == 1);
+ AtomicReaderContext ctx = ctxs.get(0);
+ q = (SpanQuery) q.rewrite(ctx.reader());
+ Spans spans = q.getSpans(ctx, null, new HashMap<Term, TermContext>());
+
+ long i = 0;
+ while (spans.next()) {
+ i++;
+ }
+ return i;
+ }
+
+ /**
+  * Counts the distinct documents with at least one span match for q,
+  * then double-checks that count against a regular collector search.
+  */
+ private long countDocs(SpanQuery q) throws Exception {
+ OpenBitSet docs = new OpenBitSet();
+ List<AtomicReaderContext> ctxs = reader.leaves();
+ assert (ctxs.size() == 1);
+ AtomicReaderContext ctx = ctxs.get(0);
+ IndexReaderContext parentCtx = reader.getContext();
+ q = (SpanQuery) q.rewrite(ctx.reader());
+
+ //getSpans requires a TermContext for every term in the query
+ Set<Term> qTerms = new HashSet<Term>();
+ q.extractTerms(qTerms);
+ Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
+
+ for (Term t : qTerms) {
+ TermContext c = TermContext.build(parentCtx, t);
+ termContexts.put(t, c);
+ }
+
+ Spans spans = q.getSpans(ctx, null, termContexts);
+
+ while (spans.next()) {
+ docs.set(spans.doc());
+ }
+ long spanDocHits = docs.cardinality();
+ // double check with a regular searcher
+ TotalHitCountCollector coll = new TotalHitCountCollector();
+ searcher.search(q, coll);
+ assertEquals(coll.getTotalHits(), spanDocHits);
+ return spanDocHits;
+
+ }
+
+
+
+ /**
+  * Mocks a synonym filter: when it encounters "abc" it injects "qrs" and
+  * "tuv" at the same position (position increment 0).
+  */
+ private final static class MockSynFilter extends TokenFilter {
+ //synonyms still waiting to be emitted for the current token
+ private List<String> synBuffer = new LinkedList<String>();
+
+ private final CharTermAttribute termAtt;
+ private final PositionIncrementAttribute posIncrAtt;
+
+ public MockSynFilter(TokenStream in) {
+ super(in);
+ termAtt = addAttribute(CharTermAttribute.class);
+ posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ }
+
+ @Override
+ public final boolean incrementToken() throws java.io.IOException {
+ //drain buffered synonyms first, stacked at the same position
+ if (synBuffer.size() > 0) {
+ termAtt.setEmpty().append(synBuffer.remove(0));
+ posIncrAtt.setPositionIncrement(0);
+ return true;
+ } else {
+ boolean next = input.incrementToken();
+ if (!next) {
+ return false;
+ }
+ String text = termAtt.toString();
+ if (text.equals("abc")){
+ synBuffer.add("qrs");
+ synBuffer.add("tuv");
+ }
+ return true;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ //clear pending synonyms so a reused stream does not emit stale tokens
+ synBuffer.clear();
+ }
+ }
+
+
+ /**
+  * Mocks what happens in a non-whitespace language. Tokenizes on white space and "_".
+  */
+ private final static class MockNonWhitespaceFilter extends TokenFilter {
+ //pieces of a "_"-split token still waiting to be emitted
+ private List<String> buffer = new LinkedList<String>();
+
+ private final CharTermAttribute termAtt;
+
+ public MockNonWhitespaceFilter(TokenStream in) {
+ super(in);
+ termAtt = addAttribute(CharTermAttribute.class);
+ }
+
+ @Override
+ public final boolean incrementToken() throws java.io.IOException {
+ //drain buffered pieces before pulling a new token from the input
+ if (buffer.size() > 0) {
+ termAtt.setEmpty().append(buffer.remove(0));
+ return true;
+ } else {
+ boolean next = input.incrementToken();
+ if (!next) {
+ return false;
+ }
+ String text = termAtt.toString();
+
+ String[] bits = text.split("_");
+ String ret = text;
+ if (bits.length > 1){
+ //emit the first piece now, queue the rest
+ ret = bits[0];
+ for (int i = 1; i < bits.length; i++){
+ buffer.add(bits[i]);
+ }
+ }
+ termAtt.setEmpty().append(ret);
+ return true;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ //clear pending pieces so a reused stream does not emit stale tokens
+ buffer.clear();
+ }
+ }
+
+
+ //mocks uppercasing vowels to test different analyzers for different fields
+ private final static class MockUCVowelFilter extends TokenFilter {
+ //compiled once: the regex is identical for every instance
+ private static final Pattern PATTERN = Pattern.compile("([aeiou])");
+ private final CharTermAttribute termAtt;
+
+ public MockUCVowelFilter(TokenStream in) {
+ super(in);
+ termAtt = addAttribute(CharTermAttribute.class);
+ }
+
+ @Override
+ public final boolean incrementToken() throws java.io.IOException {
+
+ boolean next = input.incrementToken();
+ if (!next) {
+ return false;
+ }
+ //NOTE(review): default-locale case conversion; fine for the ascii
+ //fixtures here, but Locale.ROOT would be safer if this is reused
+ String text = termAtt.toString().toLowerCase();
+ Matcher m = PATTERN.matcher(text);
+ StringBuffer sb = new StringBuffer();
+ while (m.find()){
+ m.appendReplacement(sb, m.group(1).toUpperCase());
+ }
+ m.appendTail(sb);
+ text = sb.toString();
+ termAtt.setEmpty().append(text);
+ return true;
+
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ }
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java (revision 0)
@@ -0,0 +1,988 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.flexible.core.util.UnescapedCharSequence;
+import org.apache.lucene.queryparser.flexible.standard.parser.EscapeQuerySyntaxImpl;
+import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.util.BytesRef;
+/**
+ * This class overrides some important functionality within QueryParserBase, esp.
+ * for generating terminal spanquery nodes: term, range, regex, fuzzy, etc.
+ *
+ * When SpanQueries are eventually nuked, there should be an easyish
+ * refactoring of classes that extend this class to extend QueryParserBase.
+ *
+ * This should also allow for an easy transfer to javacc or similar.
+ *
+ */
+public abstract class SpanQueryParserBase extends AnalyzingQueryParserBase{
+
+ //better to make these public in QueryParserBase
+ //conjunction/modifier codes mirroring QueryParserBase's private constants
+ public static final int CONJ_NONE = 0;
+ public static final int CONJ_AND = 1;
+ public static final int CONJ_OR = 2;
+
+ public static final int MOD_NONE = 0;
+ public static final int MOD_NOT = 10;
+ public static final int MOD_REQ = 11;
+
+
+ //sentinel values meaning "the user did not specify this parameter"
+ public static final float UNSPECIFIED_BOOST = -1.0f;
+ public static final int UNSPECIFIED_SLOP = -1;
+ public static final Boolean UNSPECIFIED_IN_ORDER = null;
+ public static final float DEFAULT_BOOST = 1.0f;
+
+ public static final boolean DEFAULT_IN_ORDER = true;
+
+
+
+ //matches a trailing fuzzy suffix: ~[>][minSim[.frac]][,prefixLen]
+ private static final Pattern FUZZY_PATTERN = Pattern
+ .compile("~(>)?(?:(\\d+)?(?:\\.(\\d+))?)?(?:,(\\d+))?$");
+ //matches runs of wildcard characters (escaping checked separately)
+ private final Pattern WILDCARD_PATTERN = Pattern.compile("([?*]+)");
+
+
+ //hard caps applied to user-supplied span distances
+ private int spanNearMaxDistance = 100;
+ private int spanNotNearMaxDistance = 50;
+ //if a full term is analyzed and the analyzer returns nothing,
+ //should a ParseException be thrown or should I just ignore the full token.
+ private boolean throwExceptionForEmptyTerm = false;
+
+
+
+
+ ///////
+ // Override getXQueries to return span queries
+ // Lots of boilerplate. Sorry.
+ //////
+
+ //not overriding: newMatchAllDocsQuery
+
+ @Override
+ protected Query newRegexpQuery(Term t){
+ Query q = super.newRegexpQuery(t);
+ //wrap so the regexp query can participate in span queries
+ return new SpanMultiTermQueryWrapper<RegexpQuery>((RegexpQuery) q);
+ }
+
+
+ /**
+  * Factory method for generating a fuzzy query. Called when the parser parses
+  * an input term token that has the fuzzy suffix (~) appended. Delegates with
+  * the configured fuzzy prefix length and default transpositions.
+  *
+  * @param field Name of the field query will use.
+  * @param termStr Term token to use for building term for the query
+  * @param minSimilarity similarity (&lt; 1.0f) or number of edits (&gt;= 1.0f)
+  *
+  * @return Resulting {@link org.apache.lucene.search.Query} built for the term
+  * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow
+  */
+ protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException
+ {
+ return getFuzzyQuery(field, termStr, minSimilarity, getFuzzyPrefixLength(), FuzzyQuery.defaultTranspositions);
+ }
+ /**
+  * Factory method for generating a fuzzy query. Called when the parser parses
+  * an input term token that has the fuzzy suffix (~) appended. Delegates with
+  * default transpositions.
+  *
+  * @param field Name of the field query will use.
+  * @param termStr Term token to use for building term for the query
+  * @param minSimilarity similarity (&lt; 1.0f) or number of edits (&gt;= 1.0f)
+  * @param prefixLength length of the required common prefix
+  *
+  * @return Resulting {@link org.apache.lucene.search.Query} built for the term
+  * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow
+  */
+ protected Query getFuzzyQuery(String field, String termStr,
+ float minSimilarity, int prefixLength) throws ParseException
+ {
+ return getFuzzyQuery(field, termStr, minSimilarity, prefixLength, FuzzyQuery.defaultTranspositions);
+ }
+
+ /**
+  * Normalizes the term text per the configured multiterm policy (analyze or
+  * lowercase), unescapes it, and builds the fuzzy query.
+  *
+  * @param field field for the query
+  * @param termStr raw term text (fuzzy suffix already stripped)
+  * @param minSimilarity similarity (&lt; 1.0f) or number of edits (&gt;= 1.0f)
+  * @param prefixLength length of the required common prefix
+  * @param transpositions whether a transposition counts as a single edit
+  * @return query
+  * @throws ParseException, RuntimeException if there was an IOException from the analysis process
+  */
+ protected Query getFuzzyQuery(String field, String termStr,
+ float minSimilarity, int prefixLength, boolean transpositions) throws ParseException{
+ if (getNormMultiTerms() == NORM_MULTI_TERMS.ANALYZE){
+ termStr = analyzeMultitermTermParseEx(field, termStr).utf8ToString();
+ } else if (getNormMultiTerms() == NORM_MULTI_TERMS.LOWERCASE){
+ termStr = termStr.toLowerCase(getLocale());
+ }
+ Term t = new Term(field, unescape(termStr));
+ return newFuzzyQuery(t, minSimilarity, prefixLength, transpositions);
+ }
+
+
+
+ /**
+  * Creates a new fuzzy query for t.
+  * <p>
+  * minimumSimilarity &lt; 1.0f is treated as a similarity fraction and
+  * values &gt;= 1.0f as a raw number of edits (rounding avoids an
+  * exception for numEdits != whole number). The configured
+  * {@link #getFuzzyMinSim()} acts as a floor/cap in either interpretation.
+  *
+  * @param t term to fuzzify
+  * @param minimumSimilarity similarity or numEdits; &lt;= 0 yields an exact term query
+  * @param prefixLength length of the required common prefix
+  * @param transpositions whether a transposition counts as a single edit
+  * @return fuzzy query wrapped for span use
+  */
+ protected Query newFuzzyQuery(Term t, float minimumSimilarity, int prefixLength,
+ boolean transpositions){
+
+ if (minimumSimilarity <=0.0f){
+ return newTermQuery(t);
+ }
+ String text = t.text();
+ int numEdits = 0;
+ int len = text.codePointCount(0, text.length());
+ if (getFuzzyMinSim() < 1.0f){
+ //if both are < 1.0 then make sure that parameter that was passed in
+ //is >= than fuzzyminsim
+ if (minimumSimilarity < 1.0f){
+ minimumSimilarity = (minimumSimilarity < getFuzzyMinSim())? getFuzzyMinSim() : minimumSimilarity;
+
+ numEdits = unboundedFloatToEdits(minimumSimilarity, len);
+
+ } else {
+ //if fuzzyMinSim < 1.0 and the parameter that was passed in
+ //is >= 1, convert that to a %, test against fuzzyminsim and then
+ //recalculate numEdits
+ float tmpSim = (len-minimumSimilarity)/(float)len;
+ tmpSim = (tmpSim < getFuzzyMinSim())? getFuzzyMinSim() : tmpSim;
+ numEdits = unboundedFloatToEdits(tmpSim, len);
+ }
+ } else {
+ //if fuzzyMinSim >= 1.0f, both values are edit counts; cap at fuzzyMinSim
+ if (minimumSimilarity < 1.0f){
+ int tmpNumEdits = unboundedFloatToEdits(minimumSimilarity, len);
+ numEdits = (tmpNumEdits >= (int)getFuzzyMinSim())?(int)getFuzzyMinSim() : tmpNumEdits;
+ } else {
+ numEdits = (minimumSimilarity >= getFuzzyMinSim())? (int) getFuzzyMinSim() : (int)minimumSimilarity;
+ }
+ }
+ /*
+ * This currently picks btwn FQ and SlowFQ based on numEdits.
+ * This is only because SFQ doesn't allow transpositions yet.
+ * Once SFQ does allow transpositions, this can be changed to
+ * run SFQ only...because SFQ does the right thing and returns
+ * an Automaton for numEdits <= 2.
+ */
+ if (numEdits <= FuzzyQuery.defaultMaxEdits){
+ FuzzyQuery fq = new FuzzyQuery(t, numEdits, prefixLength, FuzzyQuery.defaultMaxExpansions,
+ transpositions);
+ fq.setRewriteMethod(getMultiTermRewriteMethod());
+ return new SpanMultiTermQueryWrapper<FuzzyQuery>(fq);
+
+ } else {
+ SlowFuzzyQuery sfq = new SlowFuzzyQuery(t,
+ numEdits, prefixLength);
+ sfq.setRewriteMethod(getMultiTermRewriteMethod());
+ return new SpanMultiTermQueryWrapper<SlowFuzzyQuery>(sfq);
+ }
+ }
+
+
+
+ @Override
+ protected Query newPrefixQuery(Term t){
+ PrefixQuery q = new PrefixQuery(t);
+ q.setRewriteMethod(getMultiTermRewriteMethod());
+ //wrap so the prefix query can participate in span queries
+ return new SpanMultiTermQueryWrapper<PrefixQuery>(q);
+
+ }
+ /**
+  * Factory method for generating a query (similar to
+  * {@link #getWildcardQuery}). Called when parser parses an input term
+  * token that uses prefix notation; that is, contains a single '*' wildcard
+  * character as its last character. Since this is a special case
+  * of generic wildcard term, and such a query can be optimized easily,
+  * this usually results in a different query object.
+  *
+  * Depending on settings, a prefix term may be lower-cased
+  * automatically. It will not go through the default Analyzer,
+  * however, since normal Analyzers are unlikely to work properly
+  * with wildcard templates.
+  *
+  * Can be overridden by extending classes, to provide custom handling for
+  * wild card queries, which may be necessary due to missing analyzer calls.
+  *
+  * @param field Name of the field query will use.
+  * @param termStr Term token to use for building term for the query
+  * (without trailing '*' character!)
+  *
+  * @return Resulting {@link org.apache.lucene.search.Query} built for the term
+  * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow
+  */
+ protected Query getPrefixQuery(String field, String termStr) throws ParseException
+ {
+
+ if (!getAllowLeadingWildcard() && termStr.startsWith("*"))
+ throw new ParseException("'*' not allowed as first character in PrefixQuery");
+
+ //normalize the prefix per the configured multiterm policy
+ if (getNormMultiTerms() == NORM_MULTI_TERMS.ANALYZE){
+ termStr = analyzeMultitermTermParseEx(field, termStr).utf8ToString();
+ } else if (getNormMultiTerms() == NORM_MULTI_TERMS.LOWERCASE){
+ termStr = termStr.toLowerCase(getLocale());
+ }
+ Term t = new Term(field, unescape(termStr));
+ return newPrefixQuery(t);
+ }
+
+ @Override
+ protected Query newWildcardQuery(Term t){
+ WildcardQuery q = new WildcardQuery(t);
+ q.setRewriteMethod(getMultiTermRewriteMethod());
+ //wrap so the wildcard query can participate in span queries
+ return new SpanMultiTermQueryWrapper<WildcardQuery>(q);
+ }
+ /**
+  * Factory method for generating a query. Called when parser
+  * parses an input term token that contains one or more wildcard
+  * characters (? and *), but is not a prefix term token (one
+  * that has just a single * character at the end)
+  *
+  * Depending on settings, prefix term may be lower-cased
+  * automatically. It will not go through the default Analyzer,
+  * however, since normal Analyzers are unlikely to work properly
+  * with wildcard templates.
+  *
+  * Can be overridden by extending classes, to provide custom handling for
+  * wildcard queries, which may be necessary due to missing analyzer calls.
+  *
+  * @param field Name of the field query will use.
+  * @param termStr Term token that contains one or more wild card
+  * characters (? or *), but is not simple prefix term
+  *
+  * @return Resulting {@link org.apache.lucene.search.Query} built for the term
+  * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow
+  */
+ @Override
+ protected Query getWildcardQuery(String field, String termStr) throws ParseException
+ {
+ if ("*".equals(field)) {
+ if ("*".equals(termStr)) return newMatchAllDocsQuery();
+ }
+ if (!getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?")))
+ throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
+
+ if (getNormMultiTerms() == NORM_MULTI_TERMS.ANALYZE){
+
+ termStr = analyzeWildcard(field, termStr);
+ } else if (getNormMultiTerms() == NORM_MULTI_TERMS.LOWERCASE){
+ termStr = termStr.toLowerCase(getLocale());
+ }
+
+ //NOTE(review): unlike getPrefixQuery/getFuzzyQuery, termStr is not
+ //unescape()d here -- confirm this is intentional (escapes may need to
+ //survive for wildcard matching)
+ Term t = new Term(field, termStr);
+ return newWildcardQuery(t);
+ }
+
+
+ /**
+  * Builds a new {@link TermRangeQuery} instance wrapped for span use.
+  * Will convert to lowercase if {@link #getLowercaseExpandedTerms()} == true.
+  * Will analyze terms if {@link #getAnalyzeRangeTerms()} == true.
+  *
+  *
+  * @param field Field
+  * @param part1 min
+  * @param part2 max
+  * @param startInclusive true if the start of the range is inclusive
+  * @param endInclusive true if the end of the range is inclusive
+  * @return new {@link TermRangeQuery} instance
+  */
+ @Override
+ protected Query newRangeQuery(String field, String part1, String part2,
+ boolean startInclusive, boolean endInclusive) {
+ //TODO: modify newRangeQuery in QueryParserBase to throw ParseException for failure of analysis
+ //need to copy and paste this until we can change analyzeMultiterm(String field, String part) to protected
+ //if we just returned a spanmultitermwrapper around super.newRangeQuery(), analyzeMultiterm would use
+ //the analyzer, but not the multitermAnalyzer
+ String start = null;
+ String end = null;
+
+ if (part1 == null) {
+ start = null;
+ } else {
+ if (getAnalyzeRangeTerms()){
+ try {
+ start = analyzeMultitermTermParseEx(field, part1).utf8ToString();
+ } catch (ParseException e){
+ //swallow; fall back to the unanalyzed term below
+ }
+ }
+ if ((start == null && getAnalyzeRangeTerms()) ||
+ getNormMultiTerms() == NORM_MULTI_TERMS.LOWERCASE){
+ start = part1.toLowerCase(getLocale());
+ } else {
+ start = part1;
+ }
+ }
+
+ if (part2 == null) {
+ end = null;
+ } else {
+ if (getAnalyzeRangeTerms()){
+ try {
+ //FIX: previously analyzed part1 (the lower bound) here by mistake
+ end = analyzeMultitermTermParseEx(field, part2).utf8ToString();
+ } catch (ParseException e){
+ //swallow; fall back to the unanalyzed term below
+ }
+ }
+ //NOTE(review): the start branch keys lowercasing off getNormMultiTerms(),
+ //this branch off getLowercaseExpandedTerms() -- confirm the asymmetry is intended
+ if ((end == null && getAnalyzeRangeTerms()) ||
+ getLowercaseExpandedTerms()){
+ end = part2.toLowerCase(getLocale());
+ } else {
+ end = part2;
+ }
+ }
+
+ final TermRangeQuery query =
+ TermRangeQuery.newStringRange(field, unescape(start), unescape(end), startInclusive, endInclusive);
+
+ query.setRewriteMethod(getMultiTermRewriteMethod());
+ return new SpanMultiTermQueryWrapper<TermRangeQuery>(query);
+ }
+
+
+
+
+ /**
+  * This identifies and then builds the various single term and/or multiterm
+  * queries. This does not identify a regex or range term query!
+  * <p>
+  * For {@link org.apache.lucene.search.FuzzyQuery}, this defaults to
+  * {@link org.apache.lucene.search.FuzzyQuery#defaultMaxEdits}
+  * if no value is specified after the ~.
+  *
+  * @param field field for the query
+  * @param termText raw term text, possibly carrying fuzzy/wildcard syntax
+  * @param quoted whether the term appeared inside quotes
+  * @return SpanQuery or null if termText is a stop word
+  * @throws ParseException on unparseable input
+  */
+ public Query buildAnySingleTermQuery(String field, String termText, boolean quoted) throws ParseException {
+ Query q = null;
+
+ // is this a fuzzy term?
+ Matcher m = FUZZY_PATTERN.matcher(termText);
+ if (m.find() && ! isCharEscaped(termText, m.start())) {
+ String term = termText.substring(0, m.start());
+ String transposString = m.group(1);
+ String minSimilarityString = m.group(2);
+ String decimalComponent = m.group(3);
+ String prefixLenString = m.group(4);
+ float minSimilarity = (float) FuzzyQuery.defaultMaxEdits;
+ if (minSimilarityString != null && minSimilarityString.length() > 0) {
+ if (decimalComponent == null || decimalComponent.length() == 0) {
+ decimalComponent = "0";
+ }
+ try {
+ minSimilarity = Float.parseFloat(minSimilarityString + "." + decimalComponent);
+ } catch (NumberFormatException e) {
+ // shouldn't ever happen. If it does, fall back to original value of
+ // slop
+ // swallow
+ }
+ }
+
+ // if the user enters 2.4 for example, round it so that there won't be
+ // an
+ // illegalparameter exception
+ if (minSimilarity >= 1.0f) {
+ minSimilarity = (float) Math.round(minSimilarity);
+ }
+
+ int prefixLen = getFuzzyPrefixLength();
+ if (prefixLenString != null){
+ try{
+ prefixLen = Integer.parseInt(prefixLenString);
+ } catch (NumberFormatException e){
+ //swallow
+ }
+ }
+ // a '>' after the '~' turns transpositions off
+ boolean transpositions = (transposString != null) ? false : true;
+
+ q = getFuzzyQuery(field, term, minSimilarity, prefixLen, transpositions);
+ return q;
+ }
+
+ // is this a wildcard term? collect the offsets of unescaped wildcards
+ m = WILDCARD_PATTERN.matcher(termText);
+ Set<Integer> ws = new HashSet<Integer>();
+ while (m.find()) {
+ if (! isCharEscaped(termText, m.start())){
+ ws.add(m.start());
+ }
+ }
+ if (ws.size() > 0) {
+
+ if (ws.size() == 1 // there's only one wildcard character
+ && ws.contains(termText.length() - 1) // it isn't escaped
+ && termText.indexOf("*") == termText.length() - 1 // it is * not ?
+ && termText.length() > 1) { //it isn't just * by itself
+ // snip final *
+ q = getPrefixQuery(field,
+ termText.substring(0, termText.length() - 1));
+ } else {
+ q = getWildcardQuery(field, termText);
+ }
+ }
+ // if you've found anything, return it
+ if (q != null) {
+ return q;
+ }
+ // treat as basic single term query
+
+ return getFieldQuery(field, termText, quoted);
+ }
+
+
+ /** Terminal node: unescapes the term and wraps it in a SpanTermQuery. */
+ @Override
+ protected Query newTermQuery(Term t){
+ t = unescape(t);
+ return new SpanTermQuery(t);
+ }
+
+ /** Routes single-term field queries through this field's whole-term analyzer. */
+ @Override
+ protected Query getFieldQuery(String field, String termText, boolean quoted)
+ throws ParseException {
+ return newFieldQuery(getWholeTermAnalyzer(field), field, termText, quoted);
+ }
+
+ /**
+  * Phrase-with-slop entry point: builds the field query as a phrase and, if
+  * it produced a SpanNearQuery with a different slop, rebuilds it with the
+  * requested slop (clamped to spanNearMaxDistance) and inOrder == true.
+  */
+ @Override
+ protected Query getFieldQuery(String field, String queryText, int slop)
+ throws ParseException {
+ Query query = getFieldQuery(field, queryText, true);
+
+ if (query instanceof SpanNearQuery) {
+ if (((SpanNearQuery)query).getSlop() != slop){
+ slop = (slop > spanNearMaxDistance) ? spanNearMaxDistance : slop;
+ SpanQuery[] clauses = ((SpanNearQuery) query).getClauses();
+ query = new SpanNearQuery(clauses, slop, true);
+ }
+ }
+
+ return query;
+ }
+ /**
+  * Build what appears to be a simple single term query. If the analyzer breaks
+  * it into multiple terms, treat that as a "phrase" or as an "or" depending on
+  * quoted / {@link #getAutoGeneratePhraseQueries()}.
+  *
+  * Can return null (analyzer produced no tokens and
+  * throwExceptionForEmptyTerm is false)!
+  *
+  * @param analyzer analyzer to run termText through
+  * @param field field for the query
+  * @param termText raw term text
+  * @param quoted whether the term appeared inside quotes
+  * @return query, or null if the analyzer returned no tokens
+  * @throws ParseException if analysis fails or an empty term is disallowed
+  */
+ @Override
+ protected Query newFieldQuery(Analyzer analyzer, String field, String termText, boolean quoted)
+ throws ParseException {
+ //largely plagiarized from QueryParserBase
+ TokenStream source;
+ try {
+ source = analyzer.tokenStream(field, termText);
+ source.reset();
+ } catch (IOException e) {
+ ParseException p = new ParseException("Unable to initialize TokenStream to analyze query text");
+ p.initCause(e);
+ throw p;
+ }
+ CachingTokenFilter buffer = new CachingTokenFilter(source);
+ TermToBytesRefAttribute termAtt = null;
+ PositionIncrementAttribute posIncrAtt = null;
+ OffsetAttribute offsetAtt = null;
+ int numTokens = 0;
+
+ buffer.reset();
+
+ if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
+ termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
+ }
+ if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+ posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
+ }
+ if (buffer.hasAttribute(OffsetAttribute.class)){
+ offsetAtt = buffer.getAttribute(OffsetAttribute.class);
+ }
+
+ //first pass: count tokens (CachingTokenFilter records them for replay)
+ boolean hasMoreTokens = false;
+ if (termAtt != null) {
+ try {
+ hasMoreTokens = buffer.incrementToken();
+ while (hasMoreTokens) {
+ numTokens++;
+ hasMoreTokens = buffer.incrementToken();
+ }
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+ try {
+ // rewind the buffer stream
+ buffer.reset();
+ //source.end();
+ // close original stream - all tokens buffered
+ source.close();
+ }
+ catch (IOException e) {
+ ParseException p = new ParseException("Cannot close TokenStream analyzing query text");
+ p.initCause(e);
+ throw p;
+ }
+
+ BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
+
+ if (numTokens == 0){
+ if (throwExceptionForEmptyTerm){
+ throw new ParseException("Couldn't find any content term in: "+ termText);
+ }
+ return null;
+ } else if (numTokens == 1) {
+ try {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ termAtt.fillBytesRef();
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+ return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
+ } else {
+ //second pass: one SpanTermQuery per token, using position increments
+ //(preferred) or offsets to detect synonyms and stop-word holes
+ List<SpanQuery> queries = new ArrayList<SpanQuery>();
+ try{
+ if (posIncrAtt != null){
+ analyzeComplexSingleTerm(field, buffer, termAtt, bytes, posIncrAtt, queries);
+ } else if (offsetAtt != null){
+ analyzeComplexSingleTerm(field, buffer, termAtt, bytes, offsetAtt, queries);
+ } else {
+ while (buffer.incrementToken()) {
+ termAtt.fillBytesRef();
+ queries.add((SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))));
+ }
+ }
+ } catch (IOException e){
+ //ignore
+ }
+ //null entries mark stop words
+ List<SpanQuery> nonEmpties = new LinkedList<SpanQuery>();
+ for (SpanQuery piece : queries) {
+ if (piece != null) {
+ nonEmpties.add(piece);
+ } else if (piece == null && throwExceptionForEmptyTerm) {
+ throw new ParseException("Stop word found in " + termText);
+ }
+ }
+
+ if (nonEmpties.size() == 0) {
+ return getEmptySpanQuery();
+ }
+ if (nonEmpties.size() == 1) {
+ return nonEmpties.get(0);
+ }
+ SpanQuery[] ret = nonEmpties
+ .toArray(new SpanQuery[nonEmpties.size()]);
+ if (quoted || getAutoGeneratePhraseQueries() == true) {
+ return new SpanNearQuery(ret, 0, true);
+ } else {
+ return new SpanOrQuery(ret);
+ }
+ }
+ }
+
+
+
+ /**
+  * Replays the cached stream, treating any token whose start offset equals
+  * the previous token's start offset as a synonym (merged via handleSyn).
+  */
+ private void analyzeComplexSingleTerm(String field,
+ CachingTokenFilter ts, TermToBytesRefAttribute termAtt, BytesRef bytes,
+ OffsetAttribute offAtt,
+ List<SpanQuery> queries) throws IOException {
+ int lastStart = -1;
+ while (ts.incrementToken()) {
+ termAtt.fillBytesRef();
+ //if start is the same, treat it as a synonym...ignore end because
+ //of potential for shingles
+ if (lastStart > -1 && offAtt.startOffset() == lastStart)
+ //&& offAttr.endOffset() == lastEnd)
+ {
+
+ handleSyn(queries, (SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))));
+ } else {
+
+ queries.add((SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))));
+ }
+ lastStart = offAtt.startOffset();
+ }
+
+ }
+
+ /**
+  * Replays the cached stream, treating posIncr == 0 as a synonym of the
+  * previous token and posIncr > 1 as stop-word holes recorded as nulls.
+  */
+ private void analyzeComplexSingleTerm(String field,
+ CachingTokenFilter ts, TermToBytesRefAttribute termAtt, BytesRef bytes,
+ PositionIncrementAttribute posAtt,
+ List<SpanQuery> queries) throws IOException{
+ while (ts.incrementToken()) {
+ termAtt.fillBytesRef();
+ if (posAtt.getPositionIncrement() == 0){
+ handleSyn(queries, (SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))));
+ } else {
+ //add null for stop words
+ for (int i = 1; i < posAtt.getPositionIncrement(); i++) {
+ queries.add(null);
+ }
+ queries.add((SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))));
+ }
+ }
+
+ }
+
+ /**
+  * Merges currQuery into the last entry of queries as a synonym: the last
+  * entry becomes (or grows) a SpanOrQuery. A trailing null (stop word) is
+  * preserved and currQuery is appended after it.
+  */
+ private void handleSyn(List<SpanQuery> queries, SpanQuery currQuery) {
+ assert(queries != null);
+ //grab the last query added to queries
+ SpanQuery last = null;
+ boolean removed = false;
+ if (queries.size() > 0){
+ last = queries.remove(queries.size()-1);
+ removed = true;
+ }
+ //if it exists and does not equal null
+ if (last != null){
+ if (last instanceof SpanOrQuery){
+ ((SpanOrQuery)last).addClause(currQuery);
+ } else {
+ SpanQuery tmp = last;
+ last = new SpanOrQuery();
+ ((SpanOrQuery)last).addClause(tmp);
+ ((SpanOrQuery)last).addClause(currQuery);
+ }
+ queries.add(last);
+ } else {
+ //if you actually removed a null, put it back on
+ if (removed){
+ queries.add(null);
+ }
+ //then add the new term
+ queries.add(currQuery);
+ }
+ }
+
+ /**
+  * Builds a SpanOrQuery over the non-empty clauses.
+  *
+  * @param clauses candidate clauses; empty queries are dropped
+  * @return {@link org.apache.lucene.search.spans.SpanOrQuery} might be empty if clauses is null or contains
+  * only empty queries
+  */
+ protected SpanQuery buildSpanOrQuery(List<SpanQuery> clauses)
+ throws ParseException{
+ if (clauses == null || clauses.size() == 0)
+ return getEmptySpanQuery();
+
+ List<SpanQuery> nonEmpties = removeEmpties(clauses);
+ if (nonEmpties.size() == 0) {
+ return getEmptySpanQuery();
+ }
+ if (nonEmpties.size() == 1)
+ return nonEmpties.get(0);
+
+ SpanQuery[] arr = nonEmpties.toArray(new SpanQuery[nonEmpties.size()]);
+ return new SpanOrQuery(arr);
+
+ }
+
+
+ /**
+  * Builds a SpanNearQuery over the non-empty clauses.
+  * <p>
+  * FIX: the single-child special case used to rebuild the child with the
+  * inherited slop/inOrder and then discard it (dead store), falling through
+  * to wrap the child in a one-clause SpanNearQuery; it also used slop/inOrder
+  * before they were normalized (UNSPECIFIED_SLOP == -1; a null inOrder would
+  * NPE on unboxing). Normalization now happens first and the rebuilt child
+  * is returned.
+  *
+  * @param clauses candidate clauses; empty queries are dropped
+  * @param slop max distance between clauses, or UNSPECIFIED_SLOP for the default
+  * @param inOrder whether clauses must appear in order; null means unspecified
+  * @return span near query, a lone child, or an empty query
+  * @throws ParseException on unparseable clauses
+  */
+ protected SpanQuery buildSpanNearQuery(List<SpanQuery> clauses, int slop,
+ Boolean inOrder) throws ParseException {
+ if (clauses == null || clauses.size() == 0)
+ return getEmptySpanQuery();
+
+ List<SpanQuery> nonEmpties = removeEmpties(clauses);
+
+ if (nonEmpties.size() == 0) {
+ return getEmptySpanQuery();
+ }
+
+ //normalize slop and inOrder before they are used
+ if (slop == UNSPECIFIED_SLOP){
+ slop = getPhraseSlop();
+ } else if (slop > spanNearMaxDistance) {
+ slop = spanNearMaxDistance;
+ }
+
+ boolean localInOrder = DEFAULT_IN_ORDER;
+ if (inOrder != UNSPECIFIED_IN_ORDER){
+ localInOrder = inOrder.booleanValue();
+ }
+
+ if (nonEmpties.size() == 1){
+ SpanQuery child = nonEmpties.get(0);
+ //if single child is itself a SpanNearQuery, inherit slop and inorder
+ if (child instanceof SpanNearQuery){
+ SpanQuery[] childsClauses = ((SpanNearQuery)child).getClauses();
+ child = new SpanNearQuery(childsClauses, slop, localInOrder);
+ }
+ return child;
+ }
+
+ SpanQuery[] arr = nonEmpties.toArray(new SpanQuery[nonEmpties.size()]);
+ return new SpanNearQuery(arr, slop, localInOrder);
+ }
+
+ /**
+  * This is meant to "fix" two cases that might be surprising to a
+  * non-whitespace language speaker. If a user entered, e.g. "\u5927\u5B66"~3,
+  * and {@link #autoGeneratePhraseQueries} is set to true, then the parser
+  * would treat this recursively and yield [[\u5927\u5B66]]~3 by default. The user
+  * probably meant: find those two characters within three words of each other,
+  * not find those right next to each other and that hit has to be within three
+  * words of nothing.
+  *
+  * If a user entered the same thing and {@link #autoGeneratePhraseQueries} is
+  * set to false, then the parser would treat this as [(\u5927\u5B66)]~3: find
+  * one character or the other and then that hit has to be within three words
+  * of nothing...not the desired outcome.
+  *
+  * @param field field for the query
+  * @param termText this is the sole child of a SpanNearQuery as identified by a whitespace-based tokenizer
+  * @param ancestralSlop slop of the enclosing SpanNearQuery
+  * @param ancestralInOrder inOrder of the enclosing SpanNearQuery
+  * @return query
+  * @throws ParseException
+  */
+ protected Query specialHandlingForSpanNearWithOneComponent(String field,
+ String termText,
+ int ancestralSlop, Boolean ancestralInOrder) throws ParseException {
+ Query q = newFieldQuery(getWholeTermAnalyzer(field), field, termText, true);
+ //if analysis split the "single" term into a phrase, re-apply the
+ //ancestral slop/inOrder to its clauses instead of nesting it
+ if (q instanceof SpanNearQuery){
+ SpanQuery[] childsClauses = ((SpanNearQuery)q).getClauses();
+ return buildSpanNearQuery(Arrays.asList(childsClauses), ancestralSlop, ancestralInOrder);
+ }
+ return q;
+ }
+
+ /* protected Query specialHandlingForSpanNearWithOneComponent(String field,
+ String termText, int mySlop, Boolean myInOrder,
+ int ancestralSlop, Boolean ancestralInOrder) throws ParseException {
+ Query q = newFieldQuery(getAnalyzer(), field, termText, false);
+ if (q instanceof SpanNearQuery){
+ SpanQuery[] childsClauses = ((SpanNearQuery)q).getClauses();
+ if (mySlop == UNSPECIFIED_SLOP && myInOrder == UNSPECIFIED_IN_ORDER){
+ return buildSpanNearQuery(Arrays.asList(childsClauses), ancestralSlop, ancestralInOrder);
+ } else {
+ return buildSpanNearQuery(Arrays.asList(childsClauses), mySlop, myInOrder);
+ }
+ }
+ return q;
+ }
+ */
+ /**
+ * Builds a SpanNotQuery from exactly two clauses: include then exclude.
+ * @param clauses the include clause (index 0) followed by the exclude clause (index 1)
+ * @param pre positions before the include span in which the exclude span may not appear
+ * @param post positions after the include span in which the exclude span may not appear
+ * @return span not query; the empty query if include is empty, or include alone if exclude is empty
+ * @throws ParseException if there are not exactly two clauses
+ */
+ protected SpanQuery buildSpanNotNearQuery(List<SpanQuery> clauses, int pre,
+ int post) throws ParseException {
+ if (clauses.size() != 2) {
+ throw new ParseException(
+ String.format("SpanNotNear query must have two clauses. I count %d",
+ clauses.size()));
+ }
+ // if include is an empty query, treat this as just an empty query
+ if (isEmptyQuery(clauses.get(0))) {
+ return clauses.get(0);
+ }
+ // if exclude is an empty query, return include alone
+ if (isEmptyQuery(clauses.get(1))) {
+ return clauses.get(0);
+ }
+
+ if (pre > spanNotNearMaxDistance) {
+ pre = spanNotNearMaxDistance;
+ }
+ if (post > spanNotNearMaxDistance) {
+ post = spanNotNearMaxDistance;
+ }
+ return new SpanNotQuery(clauses.get(0), clauses.get(1), pre, post);
+ }
+
+
+ private List<SpanQuery> removeEmpties(List<SpanQuery> queries)
+ throws ParseException{
+ // drop empty queries; optionally treat an empty term as a parse error
+ List<SpanQuery> nonEmpties = new ArrayList<SpanQuery>();
+ for (SpanQuery q : queries) {
+ if (!isEmptyQuery(q)) {
+ nonEmpties.add(q);
+ } else if (throwExceptionForEmptyTerm){
+ throw new ParseException("Stop word or unparseable term found");
+ }
+ }
+ return nonEmpties;
+ }
+
+ public SpanQuery getEmptySpanQuery() {
+ SpanQuery empty = new SpanOrQuery(new SpanTermQuery[0]); // zero-clause OR is the "empty" sentinel
+ return empty;
+ }
+
+ public boolean isEmptyQuery(Query q) {
+ // Empty means: null, or a SpanOrQuery carrying zero clauses (see getEmptySpanQuery()).
+ if (q == null) {
+ return true;
+ }
+ return q instanceof SpanOrQuery && ((SpanOrQuery) q).getClauses().length == 0;
+ }
+
+ public static Term unescape(Term t){
+ // Removes escape characters ('\') from the term text; returns the original term on failure.
+ String txt = t.text();
+ try{
+ UnescapedCharSequence un = EscapeQuerySyntaxImpl.discardEscapeChar(txt);
+
+ if (! un.toString().equals(txt)){
+ t = new Term(t.field(),un.toString());
+ }
+ } catch (org.apache.lucene.queryparser.flexible.standard.parser.ParseException e){
+ //swallow; best effort: on a malformed escape sequence return the term unchanged
+ }
+
+ return t;
+ }
+
+ public static String unescape(String s){ // strip escape chars; returns s unchanged on a bad escape
+ try{
+ UnescapedCharSequence un = EscapeQuerySyntaxImpl.discardEscapeChar(s);
+ return un.toString();
+ } catch (org.apache.lucene.queryparser.flexible.standard.parser.ParseException e){
+ //swallow; fall through and return the input unmodified
+ }
+
+ return s;
+
+ }
+
+
+ public int getSpanNearMaxDistance() { // cap applied to slop in buildSpanNearQuery
+ return spanNearMaxDistance;
+ }
+
+ public void setSpanNearMaxDistance(int spanNearMaxDistance) { // see getSpanNearMaxDistance()
+ this.spanNearMaxDistance = spanNearMaxDistance;
+ }
+
+ public int getSpanNotNearMaxDistance() { // cap applied to pre/post in buildSpanNotNearQuery
+ return spanNotNearMaxDistance;
+ }
+
+ public void setSpanNotNearMaxDistance(int spanNotNearMaxDistance) { // see getSpanNotNearMaxDistance()
+ this.spanNotNearMaxDistance = spanNotNearMaxDistance;
+ }
+
+ /**
+ * If a term passes through the analyzer and nothing comes out,
+ * throw an exception or silently ignore the missing term. This can
+ * happen with stop words or with other strings that the analyzer
+ * ignores.
+ *
+ *
+ * This is applied only at the full term level.
+ *
+ * Currently, a parseException is thrown no matter the setting on this
+ * whenever an analyzer can't return a value for a multiterm query.
+ *
+ * @return whether to throw an exception if the analyzer yields an empty term
+ */
+ public boolean getThrowExceptionForEmptyTerm() {
+ return throwExceptionForEmptyTerm;
+ }
+
+ /**
+ * @see #getThrowExceptionForEmptyTerm()
+ * @param throwExceptionForEmptyTerm whether to throw when an analyzer yields an empty term
+ */
+ public void setThrowExceptionForEmptyTerm(boolean throwExceptionForEmptyTerm) {
+ this.throwExceptionForEmptyTerm = throwExceptionForEmptyTerm;
+ }
+
+ protected static boolean isCharEscaped(String s, int i){
+ // Count the run of backslashes immediately preceding position i;
+ // the character at i is escaped iff that run has odd length
+ // (an even run means the backslashes escape each other).
+ int backslashes = 0;
+ int pos = i - 1;
+ while (pos >= 0 && s.charAt(pos) == '\\'){
+ backslashes++;
+ pos--;
+ }
+ return backslashes % 2 == 1;
+ }
+ /**
+ * Copied nearly exactly from FuzzyQuery's floatToEdits.
+ *
+ * There are two differences:
+ *
+ * - Unlike FuzzyQuery's floatToEdits, this does not require that the
+ * return value be &lt;= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE.
+ *
+ * - This adds a small amount so that nearly exact
+ * hits don't get floored: 0.80 for termLen 5 should = 1
+ *
+ * @param minimumSimilarity &gt;= 1 is used directly as an edit count;
+ * a value in (0,1) is converted relative to the term length
+ * @param termLen length of the term in characters
+ * @return edits
+ */
+ public static int unboundedFloatToEdits(float minimumSimilarity, int termLen) {
+ if (minimumSimilarity >= 1f) {
+ return (int)minimumSimilarity;
+ } else if (minimumSimilarity == 0.0f) {
+ return 0; // 0 means exact, not infinite # of edits!
+ } else {
+ return (int)(0.00001f+(1f-minimumSimilarity) * termLen);
+ }
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanOnlyParser.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanOnlyParser.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanOnlyParser.java (revision 0)
@@ -0,0 +1,96 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.queryparser.classic.CharStream;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.spans.tokens.SQPClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPOrClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPToken;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.Version;
+
+/**
+ * This is somewhat of a toy class that enables easy testing of the span only
+ * parsing components. This does not handle boolean operators (AND, NOT, OR, +/-),
+ * and it does not handle multiple fields. It also doesn't handle MatchAllDocsQueries.
+ *
+ * However, it does guarantee that a SpanQuery is returned.
+ *
+ * The functionality of this class was the initial offering in LUCENE-5205.
+ *
+ *
+ * @see SpanQueryParser
+ */
+public class SpanOnlyParser extends AbstractSpanQueryParser{
+
+ // hard cap on how many characters ReInit will drain from a CharStream
+ private static final int MAX_QUERY_LENGTH_CHARS = 30000;
+
+ // the raw query string most recently handed to parse()/ReInit()
+ private String topLevelQueryString = "";
+
+ public SpanOnlyParser(Version matchVersion, String f, Analyzer a){
+ init(matchVersion, f, a);
+ }
+
+ public SpanOnlyParser(Version matchVersion, String f, Analyzer a, Analyzer multitermAnalyzer){
+ init(matchVersion, f, a, multitermAnalyzer);
+ }
+
+ @Override
+ public Query parse(String s) throws ParseException{
+ topLevelQueryString = s;
+ Query q = TopLevelQuery(getField());
+ assert(q == null || q instanceof SpanQuery);
+ return q;
+ }
+
+ @Override
+ public void ReInit(CharStream stream) {
+ //this is crazy...convert string to char stream then back to string for processing
+ //the value from extending QueryParserBase was greater than this
+ //bit of craziness.
+ try {
+ int i = 0;
+ while(i++ < MAX_QUERY_LENGTH_CHARS){
+ stream.readChar();
+ }
+ } catch (IOException e) {} // expected: readChar signals end-of-stream via IOException
+ topLevelQueryString = stream.GetImage();
+
+ }
+
+ @Override
+ public Query TopLevelQuery(String field) throws ParseException {
+
+ return _parsePureSpan(field, topLevelQueryString);
+ }
+
+
+ protected Query _parsePureSpan(String field, String queryString) throws ParseException{
+ SpanQueryLexer lexer = new SpanQueryLexer();
+ List<SQPToken> tokens = lexer.getTokens(queryString); // use the argument, not the stale field
+ SQPClause overallClause = new SQPOrClause(0, tokens.size());
+ return _parsePureSpanClause(tokens, field, overallClause);
+ }
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerminal.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerminal.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerminal.java (revision 0)
@@ -0,0 +1,7 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+//stub class (not an interface) to gather SQPTerm, SQPRegexTerm and SQPRangeTerm
+//under the same umbrella
+public class SQPTerminal extends SQPBoostableToken {
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPBoostableToken.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPBoostableToken.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPBoostableToken.java (revision 0)
@@ -0,0 +1,53 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+import org.apache.lucene.queryparser.spans.SpanQueryParserBase;
+
+public abstract class SQPBoostableToken implements SQPToken{
+ private float boost = SpanQueryParserBase.UNSPECIFIED_BOOST;
+
+ public void setBoost(float boost){
+ this.boost = boost;
+ }
+
+ public float getBoost(){
+ return boost;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + Float.floatToIntBits(boost);
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (!(obj instanceof SQPBoostableToken)) {
+ return false;
+ }
+ SQPBoostableToken other = (SQPBoostableToken) obj;
+ if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPBoostableToken [boost=");
+ builder.append(boost);
+ builder.append("]");
+ return builder.toString();
+ }
+
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPClause.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPClause.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPClause.java (revision 0)
@@ -0,0 +1,71 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public abstract class SQPClause extends SQPBoostableToken {
+
+ public static enum TYPE { PAREN, BRACKET, QUOTE}; // which delimiters enclosed the clause
+ private final int tokenOffsetStart;
+ private int tokenOffsetEnd;
+
+ public SQPClause(int tokenOffsetStart){
+ this.tokenOffsetStart = tokenOffsetStart;
+ }
+
+ public SQPClause(int tokenOffsetStart, int tokenOffsetEnd){
+ this(tokenOffsetStart);
+ this.tokenOffsetEnd = tokenOffsetEnd;
+ }
+ public int getTokenOffsetStart(){
+ return tokenOffsetStart;
+ }
+
+ public int getTokenOffsetEnd(){
+ return tokenOffsetEnd;
+ }
+
+ public void setTokenOffsetEnd(int tokenOffsetEnd){
+ this.tokenOffsetEnd = tokenOffsetEnd;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1; // NOTE(review): boost (super) deliberately excluded? confirm
+ result = prime * result + tokenOffsetStart;
+ result = prime * result + tokenOffsetEnd;
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (!(obj instanceof SQPClause)) {
+ return false;
+ }
+ SQPClause other = (SQPClause) obj;
+ if (tokenOffsetStart != other.tokenOffsetStart) {
+ return false;
+ }
+ if (tokenOffsetEnd != other.tokenOffsetEnd) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPClause [tokenOffsetStart=");
+ builder.append(tokenOffsetStart);
+ builder.append(", tokenOffsetEnd=");
+ builder.append(tokenOffsetEnd);
+ builder.append("]");
+ return builder.toString();
+ }
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOrClause.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOrClause.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOrClause.java (revision 0)
@@ -0,0 +1,61 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPOrClause extends SQPClause {
+
+ public static final int DEFAULT_MINIMUM_NUMBER_SHOULD_MATCH = 1;
+
+ private int minimumNumberShouldMatch = DEFAULT_MINIMUM_NUMBER_SHOULD_MATCH;
+
+ public SQPOrClause(int tokenOffsetStart, int tokenOffsetEnd){
+ super(tokenOffsetStart, tokenOffsetEnd);
+ }
+
+ public int getMinimumNumberShouldMatch(){
+ return minimumNumberShouldMatch;
+ }
+
+ public void setMinimumNumberShouldMatch(int n){
+ minimumNumberShouldMatch = n;
+ }
+
+ public TYPE getType(){
+ return TYPE.PAREN;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = super.hashCode(); // include token offsets, as the sibling clause types do
+ result = prime * result + minimumNumberShouldMatch;
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!super.equals(obj)) { // covers null and the token-offset fields
+ return false;
+ }
+ if (!(obj instanceof SQPOrClause)) {
+ return false;
+ }
+ SQPOrClause other = (SQPOrClause) obj;
+ if (minimumNumberShouldMatch != other.minimumNumberShouldMatch) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPOrClause [minimumNumberShouldMatch=");
+ builder.append(minimumNumberShouldMatch);
+ builder.append("]");
+ return builder.toString();
+ }
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPBooleanOpToken.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPBooleanOpToken.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPBooleanOpToken.java (revision 0)
@@ -0,0 +1,70 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+import org.apache.lucene.queryparser.spans.SpanQueryParserBase;
+
+
+
+public class SQPBooleanOpToken implements SQPToken{
+
+ private final int type;
+ public SQPBooleanOpToken(int type){
+ this.type = type;
+ }
+
+ public int getType(){
+ return type;
+ }
+
+ public boolean isConj(){
+ if (type == SpanQueryParserBase.CONJ_AND ||
+ type == SpanQueryParserBase.CONJ_OR){
+ return true;
+ }
+ return false;
+ }
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + type;
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (!(obj instanceof SQPBooleanOpToken)) {
+ return false;
+ }
+ SQPBooleanOpToken other = (SQPBooleanOpToken) obj;
+ if (type != other.type) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPBooleanOpToken [type=");
+ builder.append(type);
+ builder.append("]");
+ return builder.toString();
+ }
+
+ public static boolean isMod(int i) {
+ if (i == SpanQueryParserBase.CONJ_AND ||
+ i == SpanQueryParserBase.CONJ_OR){
+ return false;
+ }
+ return true;
+ }
+
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPToken.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPToken.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPToken.java (revision 0)
@@ -0,0 +1,7 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+//stub interface: common supertype for every token emitted by the SpanQueryLexer
+public interface SQPToken{
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNearClause.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNearClause.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNearClause.java (revision 0)
@@ -0,0 +1,97 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPNearClause extends SQPClause{
+
+
+ public static final Boolean UNSPECIFIED_IN_ORDER = null;
+
+
+ private final TYPE type;
+
+ private final Boolean inOrder;
+ private final boolean hasParams;
+ private final int slop;
+
+ public SQPNearClause(int tokenStartOffset, int tokenEndOffset, TYPE type,
+ boolean hasParams, Boolean inOrder, int slop){
+ super(tokenStartOffset, tokenEndOffset);
+ this.type = type;
+ this.hasParams = hasParams;
+ this.inOrder = inOrder;
+ this.slop = slop;
+ }
+
+ public TYPE getType() {
+ return type;
+ }
+
+ public Boolean getInOrder() {
+ return inOrder;
+ }
+
+ public boolean hasParams() {
+ return hasParams;
+ }
+
+ public int getSlop() {
+ return slop;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = super.hashCode();
+ result = prime * result + (hasParams ? 1231 : 1237);
+ result = prime * result + ((inOrder == null) ? 0 : inOrder.hashCode());
+ result = prime * result + slop;
+ result = prime * result + ((type == null) ? 0 : type.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!super.equals(obj)) {
+ return false;
+ }
+ if (!(obj instanceof SQPNearClause)) {
+ return false;
+ }
+ SQPNearClause other = (SQPNearClause) obj;
+ if (hasParams != other.hasParams) {
+ return false;
+ }
+ if (inOrder == null) {
+ if (other.inOrder != null) {
+ return false;
+ }
+ } else if (!inOrder.equals(other.inOrder)) {
+ return false;
+ }
+ if (slop != other.slop) {
+ return false;
+ }
+ if (type != other.type) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPNearClause [type=");
+ builder.append(type);
+ builder.append(", inOrder=");
+ builder.append(inOrder);
+ builder.append(", hasParams=");
+ builder.append(hasParams);
+ builder.append(", slop=");
+ builder.append(slop);
+ builder.append("]");
+ return builder.toString();
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPField.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPField.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPField.java (revision 0)
@@ -0,0 +1,48 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPField implements SQPToken{
+ private final String field;
+
+ public SQPField(String field){
+ this.field = field;
+ }
+
+ public String getField(){
+ return field;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((field == null) ? 0 : field.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (!(obj instanceof SQPField))
+ return false;
+ SQPField other = (SQPField) obj;
+ if (field == null) {
+ if (other.field != null)
+ return false;
+ } else if (!field.equals(other.field))
+ return false;
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPField [field=");
+ builder.append(field);
+ builder.append("]");
+ return builder.toString();
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRangeTerm.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRangeTerm.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRangeTerm.java (revision 0)
@@ -0,0 +1,94 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPRangeTerm extends SQPTerminal{
+ private final String start;
+ private final String end;
+ private final boolean startInclusive;
+ private final boolean endInclusive;
+
+ public SQPRangeTerm(String from, String to, boolean startInclusive, boolean endInclusive){
+ this.start = from;
+ this.end = to;
+ this.startInclusive = startInclusive;
+ this.endInclusive = endInclusive;
+ }
+
+ public String getStart(){
+ return start;
+ }
+
+ public String getEnd(){
+ return end;
+ }
+
+ public boolean getStartInclusive(){
+ return startInclusive;
+ }
+
+ public boolean getEndInclusive(){
+ return endInclusive;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((end == null) ? 0 : end.hashCode());
+ result = prime * result + (endInclusive ? 1231 : 1237);
+ result = prime * result + ((start == null) ? 0 : start.hashCode());
+ result = prime * result + (startInclusive ? 1231 : 1237);
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (!(obj instanceof SQPRangeTerm)) {
+ return false;
+ }
+ SQPRangeTerm other = (SQPRangeTerm) obj;
+ if (end == null) {
+ if (other.end != null) {
+ return false;
+ }
+ } else if (!end.equals(other.end)) {
+ return false;
+ }
+ if (endInclusive != other.endInclusive) {
+ return false;
+ }
+ if (start == null) {
+ if (other.start != null) {
+ return false;
+ }
+ } else if (!start.equals(other.start)) {
+ return false;
+ }
+ if (startInclusive != other.startInclusive) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPRangeTerm [start=");
+ builder.append(start);
+ builder.append(", end=");
+ builder.append(end);
+ builder.append(", startInclusive=");
+ builder.append(startInclusive);
+ builder.append(", endInclusive=");
+ builder.append(endInclusive);
+ builder.append("]");
+ return builder.toString();
+ }
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/package.html
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/package.html (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/package.html (revision 0)
@@ -0,0 +1,25 @@
+
+
+
+
+
+Classes primarily used by SpanQueryLexer.
+
+
+
+
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOpenClause.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOpenClause.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOpenClause.java (revision 0)
@@ -0,0 +1,53 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPOpenClause extends SQPClause {
+
+
+ private final TYPE type;
+
+ public SQPOpenClause(int startOffset, TYPE type){
+ super(startOffset);
+ this.type = type;
+ }
+
+ public TYPE getType() {
+ return type;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = super.hashCode();
+ result = prime * result + ((type == null) ? 0 : type.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!super.equals(obj)) {
+ return false;
+ }
+ if (!(obj instanceof SQPOpenClause)) {
+ return false;
+ }
+ SQPOpenClause other = (SQPOpenClause) obj;
+ if (type != other.type) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPOpenClause [type=");
+ builder.append(type);
+ builder.append("]");
+ return builder.toString();
+ }
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPMatchAllDocsToken.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPMatchAllDocsToken.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPMatchAllDocsToken.java (revision 0)
@@ -0,0 +1,5 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPMatchAllDocsToken extends SQPTerminal{ // marker terminal; carries no data beyond the inherited boost
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRegexTerm.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRegexTerm.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRegexTerm.java (revision 0)
@@ -0,0 +1,52 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPRegexTerm extends SQPTerminal{
+ private String term;
+ public SQPRegexTerm(String t){
+ this.term = t;
+ }
+
+ public String getString(){
+ return term;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((term == null) ? 0 : term.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (!(obj instanceof SQPRegexTerm)) {
+ return false;
+ }
+ SQPRegexTerm other = (SQPRegexTerm) obj;
+ if (term == null) {
+ if (other.term != null) {
+ return false;
+ }
+ } else if (!term.equals(other.term)) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPRegexTerm [term=");
+ builder.append(term);
+ builder.append("]");
+ return builder.toString();
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNotNearClause.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNotNearClause.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNotNearClause.java (revision 0)
@@ -0,0 +1,82 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPNotNearClause extends SQPClause{
+
+ public static final int NOT_DEFAULT = 0; // default pre/post distance for a plain NOT
+
+ private final TYPE type;
+
+ private final int notPre;
+ private final int notPost;
+
+ public SQPNotNearClause(int tokenStartOffset, int tokenEndOffset, TYPE type,
+ int notPre, int notPost){
+ super(tokenStartOffset, tokenEndOffset);
+ this.type = type;
+ this.notPre = notPre;
+ this.notPost = notPost;
+ }
+
+ public TYPE getType() {
+ return type;
+ }
+
+ public int getNotPre() {
+ return notPre;
+ }
+
+ public int getNotPost() {
+ return notPost;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = super.hashCode();
+ result = prime * result + notPost;
+ result = prime * result + notPre;
+ result = prime * result + ((type == null) ? 0 : type.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!super.equals(obj)) {
+ return false;
+ }
+ if (!(obj instanceof SQPNotNearClause)) {
+ return false;
+ }
+ SQPNotNearClause other = (SQPNotNearClause) obj;
+ if (notPost != other.notPost) {
+ return false;
+ }
+ if (notPre != other.notPre) {
+ return false;
+ }
+ if (type != other.type) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPNotNearClause [type=");
+ builder.append(type);
+ builder.append(", notPre=");
+ builder.append(notPre);
+ builder.append(", notPost=");
+ builder.append(notPost);
+ builder.append(", tokenOffsets=").append(getTokenOffsetStart());
+ builder.append(":").append(getTokenOffsetEnd()).append("]");
+ return builder.toString();
+ }
+
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerm.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerm.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerm.java (revision 0)
@@ -0,0 +1,69 @@
+package org.apache.lucene.queryparser.spans.tokens;
+
+public class SQPTerm extends SQPTerminal{
+ private final String string;
+ private boolean isQuoted = false;
+
+
+ public SQPTerm(String string){
+ this.string = string;
+ }
+
+ public String getString(){
+ return string;
+ }
+
+ public void setIsQuoted(boolean b){
+ isQuoted = b;
+ }
+ public boolean isQuoted(){
+ return isQuoted;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + (isQuoted ? 1231 : 1237);
+ result = prime * result + ((string == null) ? 0 : string.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (!(obj instanceof SQPTerm)) {
+ return false;
+ }
+ SQPTerm other = (SQPTerm) obj;
+ if (isQuoted != other.isQuoted) {
+ return false;
+ }
+ if (string == null) {
+ if (other.string != null) {
+ return false;
+ }
+ } else if (!string.equals(other.string)) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("SQPTerm [string=");
+ builder.append(string);
+ builder.append(", isQuoted=");
+ builder.append(isQuoted);
+ builder.append("]");
+ return builder.toString();
+ }
+
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java (revision 0)
@@ -0,0 +1,444 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.queryparser.classic.CharStream;
+import org.apache.lucene.queryparser.classic.ParseException;
+
+import org.apache.lucene.queryparser.spans.tokens.SQPBooleanOpToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPBoostableToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPField;
+import org.apache.lucene.queryparser.spans.tokens.SQPMatchAllDocsToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPNearClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPOrClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPTerminal;
+import org.apache.lucene.queryparser.spans.tokens.SQPToken;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.Version;
+
+/**
+ * This parser leverages the power of SpanQuery and can combine them with
+ * traditional boolean logic and multiple field information. This parser includes functionality from:
+ *
+ * - {@link org.apache.lucene.queryparser.classic.QueryParser classic QueryParser}: most of its syntax
+ * - {@link org.apache.lucene.queryparser.surround.parser.QueryParser SurroundQueryParser}: recursive parsing for "near" and "not" clauses.
+ * - {@link org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser}:
+ * can handle "near" queries that include multiterms ({@link org.apache.lucene.search.WildcardQuery},
+ * {@link org.apache.lucene.search.FuzzyQuery}, {@link org.apache.lucene.search.RegexpQuery}).
+ * - {@link org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser}: has an option to analyze multiterms.
+ *
+ *
+ *
+ *
+ *
+ * Background
+ * This parser was developed for the concordance/analytic search use case --
+ * the user wants to see every time a span occurs. The basic approach of this parser is to build
+ * BooleanQueries comprised of SpanQueries. Aside from a MatchAllDocsQuery, there should be no other
+ * types of queries.
+ *
+ *
+ *
+ * With luck, this parser will be made obsolete with Lucene-2878, but until then,
+ * this parser fills a niche.
+ *
+ *
+ * One goal was to keep the syntax as close to Lucene's classic
+ * {@link org.apache.lucene.queryparser.classic.QueryParser} as possible.
+ * Another goal was to make analysis of multiterms a fundamental part of the parser
+ * {@link AnalyzingQueryParserBase}.
+ *
+ * Similarities and Differences
+ *
+ * Same as classic syntax:
+ *
+ * - term: test
+ * - fuzzy: roam~0.8, roam~2
+ * - wildcard: te?t, test*, t*st
+ * - regex: /[mb]oat/
+ * - phrase: "jakarta apache"
+ * - phrase with slop: "jakarta apache"~3
+ * - "or" clauses: jakarta apache
+ * - grouping clauses: (jakarta apache)
+ * - field: author:hatcher title:lucene
+ * - boolean operators: (lucene AND apache) NOT jakarta
+ *
+ * - required/not required operators: +lucene +apache -jakarta
+ * - boolean with field:(author:hatcher AND author:gospodnetic) AND title:lucene
+ *
+ *
+ * Main additions in SpanQueryParser syntax vs. classic syntax:
+ *
+ * - Can require "in order" for phrases with slop with the ~> operator: "jakarta apache"~>3
+ * - Can specify "not near" "bieber fever"!~3,10 ::
+ * find "bieber" but not if "fever" appears within 3 words before or
+ * 10 words after it.
+ * - Fully recursive phrasal queries with [ and ]; as in: [[jakarta apache]~3 lucene]~>4 ::
+ * find "jakarta" within 3 words of "apache", and that hit has to be within four
+ * words before "lucene".
+ * - Can also use [] for single level phrasal queries instead of "" as in: [jakarta apache]
+ * - Can use "or" clauses in phrasal queries: "apache (lucene solr)"~3 ::
+ * find "apache" and then either "lucene" or "solr" within three words.
+ *
+ * - Can use multiterms in phrasal queries: "jakarta~1 ap*che"~2
+ * - Did I mention recursion: [[jakarta~1 ap*che]~2 (solr~ /l[ou]+[cs][en]+/)]~10 ::
+ * Find something like "jakarta" within two words of "ap*che" and that hit
+ * has to be within ten words of something like "solr" or that lucene regex.
+ * - How about: "fever (travlota~2 disco "saturday night" beeber~1)"!~3,10 :: find fever but not if something like
+ * travlota or disco or "saturday night" or something like beeber appears within 3 words before or 10 words after.
+ * - Can require at least x number of hits at boolean level: "apache AND (lucene solr tika)~2
+ *
+ *
+ *
+ * Trivial additions:
+ *
+ * - Can specify prefix length in fuzzy queries: jakarta~1,2 (edit distance=1, prefix=2)
+ * - Can specify prefix Optimal String Alignment (OSA) vs Levenshtein
+ * in fuzzy queries: jakarta~1 (OSA) vs jakarta~>1 (Levenshtein)
+ *
+ *
+ *
+ * Limitations of SpanQueryParser compared with classic QueryParser:
+ *
+ * - There is some learning curve to figure out the subtle differences in syntax between
+ * when one is within a phrase and when not. Including:
+ *
+ * - Boolean operators are not allowed within phrases: "solr (apache AND lucene)".
+ * Consider rewriting:[solr [apache lucene]]
+ * - Field information is not allowed within phrases.
+ * - Minimum hit counts for boolean or queries are not allowed within phrases: [apache (lucene solr tika)~2]
+ *
+ * - This parser is not built with .jj or the antlr parser framework.
+ * Regrettably, because it is generating a {@link org.apache.lucene.search.spans.SpanQuery},
+ * it can't use all of the generalizable queryparser infrastructure that was added with Lucene 4.+.
+ *
+ *
+ * Stop word handling
+ *
+ * The user can choose to throw a {@link org.apache.lucene.queryparser.classic.ParseException} if a stop word is encountered.
+ * If SpanQueryParserBase.throwExceptionForEmptyTerm is set to false (default), the following should happen.
+ *
+ *
+ *
+ * - Term: "the" will return an empty SpanQuery (similar to classic queryparser)
+ * - BooleanOr: (the apache jakarta) will drop the stop word and return a
+ * {@link org.apache.lucene.search.spans.SpanOrQuery} for "apache"
+ * or "jakarta"
+ *
+ * - SpanNear: "apache and jakarta" will drop the "and" and match on only "apache jakarta"
+ *
+ *
+ * A parse exception is currently always thrown if the parser analyzes a multiterm, and a subcomponent of the
+ * multiterm has a stopword: the*tre
+ *
+ * Expert: Other subtle differences between SpanQueryParser and classic QueryParser.
+ *
+ * - Fuzzy queries with slop > 2 are handled by SlowFuzzyQuery. The developer can set the minFuzzySim to limit
+ * the maximum edit distance (i.e. turn off SlowFuzzyQuery by setting fuzzyMinSim = 2.0f).
+ * - Fuzzy queries with edit distance >=1 are rounded so that an exception is not thrown.
+ *
+ *
+ * Truly Expert: there are a few other very subtle differences that are documented in comments
+ * in the sourcecode in the header of AnalyzingQueryParser.
+ *
+ *
+ * NOTE You must add the sandbox jar to your class path to include
+ * the currently deprecated {@link org.apache.lucene.sandbox.queries.SlowFuzzyQuery}.
+ *
+ *
+ */
+
+public class SpanQueryParser extends AbstractSpanQueryParser {
+
+ /*
+ * Some subtle differences between classic QueryParser and SpanQueryParser
+ *
+ * 1) In a range query, this parser is not removing double quotes.
+ * [ "abc" TO "xyz" ] -> [abc TO xyz] in classic query parser, but remains the same in SpanQueryParser
+ *
+ * 2) The SpanQueryParser does not recognize quotes as a way to escape non-regexes.
+ * In classic syntax a path string of "/abc/def/ghi" is denoted by the double quotes; in
+ * SpanQueryParser, the user has to escape the / as in \/abc\/def\/ghi
+ *
+ * 3) The SpanQueryParser appears to be adding an escape to RangeTermQueries of *, as in:
+ * in classic "[ * TO y]" -> [* TO y]
+ * but in SpanQueryParser:
+ * "[ * TO y]" -> [\* TO y]
+ *
+ * SpanQueryParser's handling of this is the same as creating a new RangeTermQuery.
+ * However, it does return different docs than the query generated by classic.
+ *
+ * 4) "term^3~" is not handled. Boosts must currently come after fuzzy mods in SpanQueryParser.
+ *
+ * 5) SpanQueryParser rounds fuzzy sims that are > 1.0. This test fails: assertParseException("term~1.1")
+ *
+ * 6) SpanQueryParser adds a small amount to its own floatToEdits calculation
+ * so that near exact percentages (e.g. 80% of a 5 char word should yield 1)
+ * aren't floored and therefore miss.
+ *
+ * For SpanQueryParser, brwon~0.80 hits on brown; however, it does
+ * not hit with classic query parser.
+ *
+ * Unfortunately, like classic query parser, SpanQueryParser will fail to parse
+ * a token with an odd number of \ ending in a phrasal boundary (LUCENE-1189).
+ *
+ * The test case that is used in LUCENE-1189 is slightly different than the original
+ * issue: \"(name:[///mike\\\\\\\") or (name:\"alphonse\")";
+ *
+ *
+ */
+ // Upper bound on how many chars ReInit will drain from a CharStream.
+ private static final int MAX_QUERY_LENGTH_CHARS = 30000;
+
+ // Raw query string captured by parse()/ReInit(); consumed by _parse().
+ private String topLevelQueryString;
+
+ /** Builds a parser that analyzes whole terms and multiterms with the same analyzer. */
+ public SpanQueryParser(Version matchVersion, String f, Analyzer a){
+ init(matchVersion, f, a);
+ }
+
+ /** Builds a parser with a separate analyzer for multiterm (wildcard/fuzzy/regex) components. */
+ public SpanQueryParser(Version matchVersion, String f, Analyzer a, Analyzer multitermAnalyzer){
+ init(matchVersion, f, a, multitermAnalyzer);
+ }
+
+ @Override
+ public void ReInit(CharStream stream) {
+ //this is crazy...convert string to char stream then back to string for processing
+ //the value from extending QueryParserBase was greater than this
+ //bit of craziness.
+ try {
+ int i = 0;
+ while(i++ < MAX_QUERY_LENGTH_CHARS){
+ stream.readChar();
+ }
+ // IOException marks end-of-stream and simply terminates the drain loop;
+ // swallowing it here is the intended read-to-end idiom.
+ } catch (IOException e) {}
+ topLevelQueryString = stream.GetImage();
+ }
+
+ @Override
+ public Query TopLevelQuery(String field) throws ParseException {
+
+ return _parse(field);
+ }
+
+ /** Main entry point: lexes and parses {@code s} against the default field. */
+ @Override
+ public Query parse(String s) throws ParseException {
+ topLevelQueryString = s;
+ return TopLevelQuery(getField());
+ }
+
+ // Lexes the stored query string and recursively parses the resulting token
+ // list as one implicit top-level "or" clause. Empty/null input yields an
+ // empty SpanQuery rather than an exception.
+ private Query _parse(String field) throws ParseException {
+ if (topLevelQueryString == null || topLevelQueryString.equals("")){
+ return getEmptySpanQuery();
+ }
+ SpanQueryLexer lexer = new SpanQueryLexer();
+ List tokens = lexer.getTokens(topLevelQueryString);
+ //just for debugging
+ // NOTE(review): this loop computes 'end' and 'boost' but never uses them --
+ // debug leftover; consider deleting before commit.
+ for (int i = 0; i < tokens.size(); i++){
+ SQPToken t = tokens.get(i);
+ String end = "";
+ Float boost = Float.NaN;
+ if (t instanceof SQPClause){
+ end = Integer.toString(((SQPClause)t).getTokenOffsetEnd());
+ }
+ if (t instanceof SQPBoostableToken){
+ boost = ((SQPBoostableToken)t).getBoost();
+ }
+ }
+ SQPClause overallClause = new SQPOrClause(0, tokens.size());
+ return parseRecursively(tokens, field, overallClause);
+ }
+
+ // Walks tokens[clause.start, clause.end), building BooleanClauses.
+ // Boolean operators and field tokens only modify local state (conj/mods/tmpField)
+ // for the NEXT query-producing token; both are reset after every clause is added.
+ private Query parseRecursively(final List tokens,
+ String field, SQPClause clause)
+ throws ParseException{
+ int start = clause.getTokenOffsetStart();
+ int end = clause.getTokenOffsetEnd();
+ testStartEnd(tokens, start, end);
+ List clauses = new ArrayList();
+ int conj = CONJ_NONE;
+ int mods = MOD_NONE;
+ String tmpField = field;
+ int i = start;
+ while (i < end){
+ Query q = null;
+ SQPToken token = tokens.get(i);
+
+ //if boolean operator or field, update local buffers and continue
+ if (token instanceof SQPBooleanOpToken){
+ SQPBooleanOpToken t = (SQPBooleanOpToken)token;
+ if (t.isConj()){
+ conj = t.getType();
+ mods = MOD_NONE;
+ } else {
+ mods = t.getType();
+ }
+ i++;
+ continue;
+ } else if (token instanceof SQPField){
+ tmpField = ((SQPField)token).getField();
+ i++;
+ continue;
+ }
+ //if or clause, recur through tokens
+ if (token instanceof SQPOrClause){
+ //recur!
+ SQPOrClause tmpOr = (SQPOrClause)token;
+ q = parseRecursively(tokens, tmpField, tmpOr);
+
+ if (q instanceof BooleanQuery && tmpOr.getMinimumNumberShouldMatch() > 1){
+ ((BooleanQuery)q).setMinimumNumberShouldMatch(tmpOr.getMinimumNumberShouldMatch());
+ }
+
+ // only apply the clause's boost if the subquery didn't already get one
+ if (q.getBoost() == 1.0f
+ && tmpOr.getBoost() != SpanQueryParserBase.UNSPECIFIED_BOOST){
+ q.setBoost(tmpOr.getBoost());
+ }
+ // skip over the tokens consumed by the nested clause
+ i = tmpOr.getTokenOffsetEnd();
+ } else if (token instanceof SQPNearClause){
+ SQPNearClause tmpNear = (SQPNearClause)token;
+ q = _parsePureSpanClause(tokens, tmpField, tmpNear);
+ i = tmpNear.getTokenOffsetEnd();
+
+ } else if (token instanceof SQPMatchAllDocsToken){
+ //order matters SQPMatchAllDocs must be before terminal
+ q = new MatchAllDocsQuery();
+ i++;
+ } else if (token instanceof SQPTerminal){
+ SQPTerminal tmpTerm = (SQPTerminal)token;
+ q = buildTerminal(tmpField, tmpTerm);
+ i++;
+ }
+ // empty queries (e.g. pure stop words) are silently dropped
+ if (! isEmptyQuery(q)){
+ addClause(clauses, conj, mods, q);
+ }
+ //reset mods and conj and field
+ mods = MOD_NONE;
+ conj = CONJ_NONE;
+ tmpField = field;
+ }
+
+ if (clauses.size() == 0){
+ return getEmptySpanQuery();
+ }
+ // single positive clause: unwrap it rather than wrapping in a BooleanQuery
+ if (clauses.size() == 1 &&
+ clauses.get(0).getOccur() != Occur.MUST_NOT){
+ return clauses.get(0).getQuery();
+ }
+
+ BooleanQuery bq = new BooleanQuery();
+ try{
+
+ for (BooleanClause bc : clauses){
+ bq.add(bc);
+ }
+ } catch (BooleanQuery.TooManyClauses e){
+ // surface as a ParseException so callers only deal with one exception type
+ throw new ParseException(e.getMessage());
+ }
+
+ if (clause instanceof SQPOrClause){
+ SQPOrClause tmpClause = (SQPOrClause)clause;
+ if (tmpClause.getMinimumNumberShouldMatch() > SQPOrClause.DEFAULT_MINIMUM_NUMBER_SHOULD_MATCH){
+ bq.setMinimumNumberShouldMatch(tmpClause.getMinimumNumberShouldMatch());
+ }
+ }
+
+ return bq;
+ }
+
+
+ // Validates clause boundaries: a clause may not open with AND/OR nor end
+ // with a field or any boolean operator.
+ private void testStartEnd(List tokens, int start, int end)
+ throws ParseException {
+
+ SQPToken s = tokens.get(start);
+ if (s instanceof SQPBooleanOpToken){
+ int type = ((SQPBooleanOpToken)s).getType();
+ if ( type == CONJ_AND || type == CONJ_OR){
+ throw new ParseException("Can't start clause with AND or OR");
+ }
+ }
+
+ SQPToken e = tokens.get(end-1);
+
+ if (e instanceof SQPField){
+ throw new ParseException("Can't end clause with a field token");
+ }
+ if (e instanceof SQPBooleanOpToken){
+ throw new ParseException("Can't end clause with a boolean operator");
+ }
+
+
+ }
+
+
+ /**
+ * Extracts the spans from the BooleanQueries that are not in Occur.NOT
+ * clauses for highlighting. This query should not be used for document retrieval
+ * and may return different documents than "parse."
+ *
+ * @param field
+ * @param queryString
+ * @return SpanQuery for highlighting
+ * @throws ParseException
+ */
+ public SpanQuery getHighlightQuery(String field, String queryString) throws ParseException{
+ Query q = parse(queryString);
+ List sqs = new ArrayList();
+ extractSpanQueries(field, q, sqs);
+ return buildSpanOrQuery(sqs);
+ }
+
+ /**
+ * Takes a query generated by this parser and extracts all
+ * SpanQueries into sqs that are not in a Boolean.Occur.NOT clause
+ * and that match the given field.
+ *
+ * The Query must consist of only BooleanQuery and SpanQuery objects!!!
+ * @param field
+ * @param query
+ * @param sqs
+ */
+ private void extractSpanQueries(String field, Query query, List sqs) {
+ if (query == null){
+ return;
+ }
+ if (query instanceof SpanQuery){
+ SpanQuery sq = (SpanQuery)query;
+ if (! isEmptyQuery(sq) &&
+ sq.getField().equals(field)){
+ sqs.add((SpanQuery)query);
+ }
+
+ } else if (query instanceof BooleanQuery){
+ BooleanQuery bq = (BooleanQuery)query;
+ BooleanClause[] clauses = bq.getClauses();
+ for (BooleanClause clause : clauses){
+ if (clause.getOccur() != Occur.MUST_NOT){
+ extractSpanQueries(field, clause.getQuery(), sqs);
+ }
+ }
+ } else {
+ //ignore -- any other Query type (e.g. MatchAllDocsQuery) contributes no spans
+ }
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryLexer.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryLexer.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryLexer.java (revision 0)
@@ -0,0 +1,626 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Stack;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.spans.tokens.SQPBooleanOpToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPBoostableToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPClause.TYPE;
+import org.apache.lucene.queryparser.spans.tokens.SQPField;
+import org.apache.lucene.queryparser.spans.tokens.SQPMatchAllDocsToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPNearClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPNotNearClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPOpenClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPOrClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPRangeTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPRegexTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPTerminal;
+import org.apache.lucene.queryparser.spans.tokens.SQPToken;
+import org.apache.lucene.util.mutable.MutableValueInt;
+
+/**
+ * Tokenizer that returns a list of tokens of types:
+ * Term, RegexTerm, RangeTerm
+ * Boolean AND, NOT, etc
+ * Field
+ *
+ * A clause is represented as a node in the list where the clause started.
+ * The clause includes offsets into the list for where its contents start and end.
+ *
+ * Unescapes field and boolean operator tokens, but nothing else
+ *
+ *
+ * Identifies the following types of exceptions:
+ * mismatching/unmatched () "" []
+ * bad unicode escape sequences
+ * some illegal conj and mods (and and)
+ * bad boosts: term^0.6^2
+ *
+ *
+ *
+ */
+public class SpanQueryLexer {
+
+ private final static String AND = "AND";
+ private final static String NOT = "NOT";
+ private final static String OR = "OR";//silently removed from queries...beware!
+
+ private final static int DEFAULT_MIN_REQUIRED_IN_OR = 2;
+
+ // private final static Pattern UNESCAPE = Pattern.compile("\\\\([-+\\p{Z}:\\\\\\(\\)\\[\\]])");
+ private final static Pattern UNESCAPE_REGEX = Pattern.compile("\\\\(.)");
+
+
+ private final static String BOOST = "\\^((?:\\d*\\.)?\\d+)";
+ private final static Pattern TERM_BOOST = Pattern.compile("^(?:(?:\\\\.)|(?:[^\\\\]))*?(?:"+BOOST+")?$");
+
+ private final static String OPEN_PAREN = "(";
+ private final static String OPEN_BRACKET = "[";
+ private final static String CLOSE_BRACKET = "]";
+ private final static String DQUOTE = "\"";
+
+ //Groups
+ public enum G {
+ WHOLE,
+ ESCAPE,
+ MATCH_ALL_DOCS,
+ SPACE,
+ FIELD,
+ PLUS_MINUS,
+ RANGE_START,
+ RANGE_TERM_FROM,
+ RANGE_TERM_TO,
+ RANGE_END,
+ REGEX,
+ CLOSE_PAREN,
+ CLOSE_PAREN_DIGITS,
+ QUOTE_OR_CLOSING_BRACKET,
+ NEAR_PARAM,
+ NEAR_IN_ORDER,
+ NEAR_SLOP,
+ NOT_NEAR_PRE,
+ NOT_NEAR_POST,
+ BOOST,
+ OPEN_PAREN_OR_BRACKET,
+
+
+ };
+
+ //using \\p{Z} to capture wider variety of Unicode whitespace than \\s
+ //DO NOT DO THIS!!! Blew heap when a string had a character beyond bmp.
+ // private final static String TERMINAL_STRING =
+ //"((?:\\\\.|(?:[-+](?![/\\(\\[\"]))|[^-+\\\\\\(\\)\\[\\]\\p{Z}\"/:\\^])+)(?:(:)|"+BOOST+")?";
+
+ private final static Pattern ILLEGAL_END = Pattern.compile("^((?:\\\\.)|[^\\\\])*\\\\$");
+ private final static Pattern ILLEGAL_UNICODE_ESCAPE = Pattern.compile("\\\\u([0-9a-fA-F]{0,4})");//{0,4})");
+
+ private final static String ESCAPE_STRING = "((?:\\\\.)+)";
+ private final static String SPACE_STRING = "(\\p{Z}+)";
+ private final static String MATCH_ALL_DOCS_STRING = "(\\*:\\*)";
+ private final static String FIELD_END_STRING = ("(:)");
+ private final static String REGEX_STRING = "(?:/((?:\\\\.|[^/\\\\])+?)/)";
+
+ private final static String RANGE_TERM = "((?:\\\\.|[^\\p{Z}\\(\\)\\[\\]{}])+)";
+ private final static String RANGE_START_DELIMITER = "([\\[{])";
+ private final static String RANGE_END_DELIMITER = "([\\]}])";
+
+ private final static String RANGE_STRING = "(?:"+RANGE_START_DELIMITER+
+ "\\p{Z}*"+RANGE_TERM+"\\p{Z}+TO\\p{Z}+"+RANGE_TERM+"\\p{Z}*"+RANGE_END_DELIMITER+"(?!(?:~|!~)))";
+ //plus/minus must not be followed by a space (to be boolean op)
+ private final static String PLUS_MINUS_STRING = "(?:([-+])(?!\\p{Z}))";
+ private final static String OPENING = "([\\(\\[])";
+
+ private final static String NEAR_MODIFIERS = "~(?:(>)?(\\d+)?)?";
+ private final static String NOT_NEAR_MODIFIERS = "!~(?:(\\d+)(?:,(\\d+))?)?";
+
+ private final static String NEAR_CLOSING_MODIFIERS = "("+NEAR_MODIFIERS+"|"+NOT_NEAR_MODIFIERS+")?";
+
+ private final static String OR_CLOSING_MODIFIER = "(?:~(\\d*))?";
+ private final static String CLOSING_STRING = "(?:(\\))"+OR_CLOSING_MODIFIER+")|(?:([\\]\"])"+NEAR_CLOSING_MODIFIERS+")";
+
+ private final static Pattern TOKENIZER = Pattern.compile(
+ //TERMINAL_STRING +"|"+
+ ESCAPE_STRING + "|" + MATCH_ALL_DOCS_STRING + "|" + SPACE_STRING + "|"+
+ FIELD_END_STRING+"|"+
+ PLUS_MINUS_STRING+
+ "|(?:"+RANGE_STRING+"|"+
+ REGEX_STRING +"|"+CLOSING_STRING + ")(?:"+BOOST+")?|"+OPENING);
+
+ /**
+ * Lexes the raw query string into a flat token list (terms, fields,
+ * boolean ops, and clause markers whose offsets index back into the list).
+ *
+ * @param s raw query string
+ * @return list of SQPToken
+ * @throws ParseException on trailing backslash, malformed \\uXXXX escapes,
+ * unmatched brackets/quotes, or illegal operator sequences
+ */
+ public List getTokens(String s) throws ParseException{
+ // reject a query ending in an unescaped backslash up front
+ Matcher m = ILLEGAL_END.matcher(s);
+ if (m.find()){
+ throw new ParseException("Can't end query with unescaped backslash character");
+ }
+ // every \\u escape must have exactly 4 hex digits
+ m = ILLEGAL_UNICODE_ESCAPE.matcher(s);
+ while (m.find()){
+ if (m.group(1).length() != 4){
+ throw new ParseException ("Illegal escaped unicode character: "+m.group(1));
+ }
+ }
+ List tokens = new ArrayList();
+
+ // stack of currently-open clauses; nearDepth > 0 means inside a "near" ([..] or "..")
+ Stack stack = new Stack();
+ MutableValueInt nearDepth = new MutableValueInt();
+ nearDepth.value = 0;
+
+ m = TOKENIZER.matcher(s);
+
+ // 'last' tracks the end of the previous match; text between matches is a raw term
+ int last = 0;
+ while (m.find()){
+
+
+ if (m.group(G.SPACE.ordinal()) != null){
+ //space
+ if (m.start() > last){
+ String term = s.substring(last, m.start());
+ addRawTerm(term, nearDepth.value, tokens);
+ }
+ last = m.end();
+ } else if (m.group(G.ESCAPE.ordinal()) != null){
+ //don't set last; keep going
+ } else if (m.group(G.FIELD.ordinal()) != null){
+ if (m.start() > 0 && m.start() > last){
+ String term = s.substring(last, m.start());
+ addField(term, nearDepth.value, tokens);
+ last = m.end();
+ }
+
+ } else if (m.group(G.MATCH_ALL_DOCS.ordinal()) != null){
+ tokens.add(new SQPMatchAllDocsToken());
+ last = m.end();
+ } else {
+
+ // operator match: first flush any pending raw term before it
+ if (m.start() > last){
+ String term = s.substring(last, m.start());
+ addRawTerm(term, nearDepth.value, tokens);
+ }
+ addOpTokens(m, tokens, stack, nearDepth);
+
+ last = m.end();
+ }
+
+ }
+ // trailing text after the final match is one last raw term
+ if (last < s.length()){
+ String term = s.substring(last);
+ addRawTerm(term, nearDepth.value, tokens);
+ }
+
+ if (stack.size() != 0){
+ //TODO add more info
+ throw new ParseException("unmatched bracket");
+ } else if (nearDepth.value != 0){
+ throw new ParseException("error in nearDepth calc");
+ }
+
+ testSingle(tokens);
+ return tokens;
+ }
+
+
+ // Converts a non-term regex match (clause open/close, +/- modifier, regex
+ // term, or range term) into a token. Close-paren and close-bracket/quote
+ // are delegated and return early; all other branches funnel into the
+ // shared boost-and-add at the bottom.
+ private void addOpTokens(Matcher m,
+ List tokens, Stack stack, MutableValueInt nearDepth)
+ throws ParseException{
+
+ //these return early
+ //perhaps rearrange to more closely align with operator frequency
+ if (m.group(G.CLOSE_PAREN.ordinal()) != null){
+ processCloseParen(m, tokens, stack, nearDepth.value);
+ return;
+ } else if (m.group(G.QUOTE_OR_CLOSING_BRACKET.ordinal()) != null){
+ processCloseBracketOrQuote(m, tokens, stack, nearDepth);
+ return;
+ }
+
+ SQPToken token = null;
+
+ if (m.group(G.OPEN_PAREN_OR_BRACKET.ordinal()) != null){
+ //open paren or open bracket
+ if (m.group(G.OPEN_PAREN_OR_BRACKET.ordinal()).equals(OPEN_PAREN)){
+ token = new SQPOpenClause(tokens.size(), TYPE.PAREN);
+ } else if (m.group(G.OPEN_PAREN_OR_BRACKET.ordinal()).equals(OPEN_BRACKET)){
+ token = new SQPOpenClause(tokens.size(), TYPE.BRACKET);
+ // an open bracket starts a "near" context
+ nearDepth.value++;
+ } else {
+ //should never happen !!!
+ }
+ stack.push((SQPOpenClause)token);
+ } else if (m.group(G.PLUS_MINUS.ordinal()) != null){
+ String pm = m.group(G.PLUS_MINUS.ordinal());
+ if (pm.equals("+")){
+ token = new SQPBooleanOpToken(SpanQueryParserBase.MOD_REQ);
+ testBooleanTokens(tokens, (SQPBooleanOpToken)token);
+ } else if (pm.equals("-")){
+ token = new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT);
+ testBooleanTokens(tokens, (SQPBooleanOpToken)token);
+ }
+ } else if (m.group(G.REGEX.ordinal()) != null){
+ token = new SQPRegexTerm(unescapeRegex(m.group(G.REGEX.ordinal())));
+ } else if (m.group(G.RANGE_TERM_FROM.ordinal()) != null){
+ // [ / ] = inclusive, { / } = exclusive, per classic range syntax
+ boolean startInclusive = false;
+ boolean endInclusive = false;
+ if (m.group(G.RANGE_START.ordinal()).equals(OPEN_BRACKET)){
+ startInclusive = true;
+ }
+ if (m.group(G.RANGE_END.ordinal()).equals(CLOSE_BRACKET)){
+ endInclusive = true;
+ }
+
+ token = new SQPRangeTerm(m.group(G.RANGE_TERM_FROM.ordinal()), m.group(G.RANGE_TERM_TO.ordinal()),
+ startInclusive, endInclusive);
+ }
+
+ if (token != null){
+ tryToSetBoost(token, m.group(G.BOOST.ordinal()));
+ tokens.add(token);
+ }
+ }
+
+ // Handles a double quote or a closing bracket. Quotes are ambiguous
+ // (open vs close) and are delegated; a close bracket must match an open
+ // bracket on top of the stack, and replaces that opener in the token list
+ // with the completed near/not-near clause.
+ private void processCloseBracketOrQuote(Matcher m, List tokens,
+ Stack stack, MutableValueInt nearDepth) throws ParseException {
+ //open or close quote or closing bracket
+ //let's start with quote
+ if (m.group(G.QUOTE_OR_CLOSING_BRACKET.ordinal()).equals(DQUOTE)){
+ processDQuote(m, tokens, stack, nearDepth);
+ return;
+ }
+ //from here on out, must be close bracket
+ //test for mismatched
+ if (stack.isEmpty()){
+ throw new ParseException("Couldn't find matching open bracket/quote.");
+ }
+
+ SQPOpenClause open = stack.pop();
+ if (open.getType() != TYPE.BRACKET){
+ //TODO: improve error message
+ throw new ParseException("Was expecting matching bracket!");
+ }
+
+ SQPClause clause = buildNearOrNotNear(m, tokens, open);
+
+ // NOTE(review): boost is applied to 'open' here, while 'clause' gets its
+ // boost inside buildNearOrNotNear -- confirm this line is not redundant.
+ tryToSetBoost(open, m.group(G.BOOST.ordinal()));
+ nearDepth.value--;
+ // the opener's slot in the list becomes the completed clause node
+ tokens.set(open.getTokenOffsetStart(), clause);
+ }
+
+
+ // Disambiguates a double quote: if a QUOTE opener is on top of the stack,
+ // this quote closes it; otherwise it is treated as a new opener (and
+ // enters a "near" context, like an open bracket).
+ private void processDQuote(Matcher m, List tokens,
+ Stack stack, MutableValueInt nearDepth) throws ParseException{
+ //If a double-quote, don't know if open or closing yet
+ //first test to see if there's a matching open quote on the stack
+ //if there is, this must be a closing quote
+ //if there isn't, push whatever was on the stack back and
+ //treat this as an opening quote
+ if (stack.size() > 0){
+ SQPOpenClause openCand = stack.pop();
+ if (openCand.getType() == TYPE.QUOTE){
+ processDQuoteClose(m, tokens, openCand, nearDepth);
+ return;
+
+ }
+ //put candidate back on the stack
+ stack.push(openCand);
+ }
+ //by this point, we know that this double quote must be an opener
+ SQPOpenClause token = new SQPOpenClause(tokens.size(), TYPE.QUOTE);
+
+ stack.push(token);
+ nearDepth.value++;
+ tokens.add(token);
+ }
+
+ // Completes a quoted clause. Special case: a parameterless quote pair
+ // around exactly one term/regex collapses to a plain quoted SQPTerm
+ // (so "apache" behaves like the single term apache, marked quoted)
+ // instead of a one-element near clause.
+ private void processDQuoteClose(Matcher m, List tokens,
+ SQPOpenClause open, MutableValueInt nearDepth) throws ParseException{
+ SQPClause clause = buildNearOrNotNear(m, tokens, open);
+ //special handling if a single term between double quotes
+ //and the double quotes don't have any parameters
+ if (clause instanceof SQPNearClause &&
+ ! ((SQPNearClause)clause).hasParams() &&
+ open.getTokenOffsetStart() == tokens.size()-2 &&
+ tokens.size()-2 >=0){
+ boolean abort = false;
+ SQPToken content = tokens.get(tokens.size()-1);
+ if (content instanceof SQPRegexTerm){
+ //add back in the original / and /
+ content = new SQPTerm(escapeDQuote("/"+((SQPRegexTerm)content).getString())+"/");
+ } else if (content instanceof SQPTerm){
+ content = new SQPTerm(escapeDQuote(((SQPTerm)content).getString()));
+ } else {
+ // anything else (e.g. a clause) can't be collapsed; fall through to normal path
+ abort = true;
+ }
+ if (abort == false){
+ //remove the last content token
+ tokens.remove(tokens.size()-1);
+ //remove the opening clause marker
+ tokens.remove(tokens.size()-1);
+ tokens.add(content);
+ // inherit the quote's boost only if the term has none of its own
+ if (clause.getBoost() != SpanQueryParserBase.UNSPECIFIED_BOOST &&
+ ((SQPBoostableToken)content).getBoost() == SpanQueryParserBase.UNSPECIFIED_BOOST){
+ ((SQPBoostableToken)content).setBoost(clause.getBoost());
+ }
+ ((SQPTerm)content).setIsQuoted(true);
+ nearDepth.value--;
+ return;
+ }
+ }
+
+ // normal path: replace the opener slot with the completed clause
+ nearDepth.value--;
+ tokens.set(open.getTokenOffsetStart(), clause);
+ }
+
+
+ // Builds the clause node for a just-closed bracket/quote: "!~pre,post"
+ // yields an SQPNotNearClause; otherwise "~", "~N", or "~>N" parameters
+ // (or none) yield an SQPNearClause. Clause content spans from one past
+ // the opener to the current end of the token list.
+ private SQPClause buildNearOrNotNear(Matcher m, List tokens, SQPOpenClause open)
+ throws ParseException{
+ //try for not near first, return early
+ if (m.group(G.NEAR_PARAM.ordinal()) != null && m.group(G.NEAR_PARAM.ordinal()).startsWith("!")){
+ int notPre = SQPNotNearClause.NOT_DEFAULT;
+ int notPost = SQPNotNearClause.NOT_DEFAULT;
+ if (m.group(G.NOT_NEAR_PRE.ordinal()) != null){
+ notPre = Integer.parseInt(m.group(G.NOT_NEAR_PRE.ordinal()));
+ // a single number "!~N" applies both before and after
+ notPost = notPre;
+ }
+ if (m.group(G.NOT_NEAR_POST.ordinal()) != null){
+ notPost = Integer.parseInt(m.group(G.NOT_NEAR_POST.ordinal()));
+ }
+ //contents of this clause start at 1 after tokenOffsetStart
+ SQPNotNearClause clause = new SQPNotNearClause(open.getTokenOffsetStart()+1, tokens.size(),
+ open.getType(), notPre, notPost);
+ tryToSetBoost((SQPBoostableToken)clause, m.group(G.BOOST.ordinal()));
+ return clause;
+ }
+
+ //must be near
+ //if nothing is specified, inOrder == true
+ Boolean inOrder = SQPNearClause.UNSPECIFIED_IN_ORDER;
+ int slop = AbstractSpanQueryParser.UNSPECIFIED_SLOP;
+ boolean hasParams = false;
+ if (m.group(G.NEAR_PARAM.ordinal()) != null){
+ hasParams = true;
+ // bare "~" means unordered; "~>" below flips it back to ordered
+ inOrder = new Boolean(false);
+ }
+
+ if (m.group(G.NEAR_SLOP.ordinal()) != null){
+ slop = Integer.parseInt(m.group(G.NEAR_SLOP.ordinal()));
+ }
+
+ if (m.group(G.NEAR_IN_ORDER.ordinal()) != null){
+ inOrder = new Boolean(true);
+ }
+ SQPNearClause clause = new SQPNearClause(open.getTokenOffsetStart()+1, tokens.size(),
+ open.getType(), hasParams, inOrder, slop);
+ tryToSetBoost((SQPBoostableToken)clause, m.group(G.BOOST.ordinal()));
+ return clause;
+ }
+
+ // Parses and applies a boost string to the token if it is boostable;
+ // silently ignores null/empty boost strings and non-boostable tokens.
+ private void tryToSetBoost(SQPToken open, String boostString) throws ParseException{
+ if (boostString == null || boostString.length() == 0){
+ return;
+ }
+
+ if (open instanceof SQPBoostableToken){
+ try{
+ float b = Float.parseFloat(boostString);
+ ((SQPBoostableToken)open).setBoost(b);
+ } catch (NumberFormatException e){
+ //if the regex works properly, this should never happen
+ throw new ParseException("Unable to parse number in boost: " + boostString);
+ }
+ }
+ }
+
+
+
+ // Handles ")" (with optional "~N" minimum-should-match and boost): pops
+ // the matching "(" opener and replaces its slot with a completed
+ // SQPOrClause. min-should-match is illegal inside a "near" clause.
+ private void processCloseParen(Matcher m, List tokens,
+ Stack stack, int nearDepth) throws ParseException {
+ if (stack.isEmpty()){
+ throw new ParseException("Mismatched closing paren");
+ }
+ SQPOpenClause openCand = stack.pop();
+ if (openCand.getType() == TYPE.PAREN){
+ SQPOrClause clause = new SQPOrClause(openCand.getTokenOffsetStart()+1,
+ tokens.size());
+ if (m.group(G.CLOSE_PAREN_DIGITS.ordinal()) != null){
+ throwIfNear(nearDepth,
+ "Can't specify minimum number of terms for an 'or' clause within a 'near' clause");
+
+ // bare "~" (no digits) falls back to the default minimum
+ if (m.group(G.CLOSE_PAREN_DIGITS.ordinal()).length() > 0){
+ clause.setMinimumNumberShouldMatch(Integer.parseInt(m.group(G.CLOSE_PAREN_DIGITS.ordinal())));
+ } else {
+ clause.setMinimumNumberShouldMatch(DEFAULT_MIN_REQUIRED_IN_OR);
+ }
+ }
+ tryToSetBoost(clause, m.group(G.BOOST.ordinal()));
+ tokens.set(openCand.getTokenOffsetStart(), clause);
+ return;
+ }
+ throw new ParseException("Was expecting \")\" but found " + openCand.getType());
+ }
+
+ /**
+ * Throws a ParseException carrying the supplied message whenever we are
+ * nested inside at least one "near" clause (nearDepth != 0).
+ *
+ * @param nearDepth current "near" nesting depth; 0 means not inside a near clause
+ * @param string message for the exception
+ * @throws ParseException if nearDepth != 0
+ */
+ private void throwIfNear(int nearDepth, String string) throws ParseException{
+ if (nearDepth != 0){
+ throw new ParseException(string);
+ }
+ }
+
+ /**
+ * Appends a field token (e.g. the "title" in title:term) to tokens.
+ *
+ * @param term raw (possibly escaped) field name
+ * @param nearDepth current "near" nesting depth
+ * @param tokens token list; appended to in place
+ * @throws ParseException if inside a "near" clause, or if the previous
+ * token is already a field (two fields in a row means the first had no terminal)
+ */
+ private void addField(String term, int nearDepth, List tokens) throws ParseException{
+ if (nearDepth != 0){
+ throw new ParseException("Can't specify a field within a \"Near\" clause");
+ }
+ if (tokens.size() > 0 && tokens.get(tokens.size()-1) instanceof SQPField){
+ throw new ParseException("A field must contain a terminal")(
+ }
+ SQPToken token = new SQPField(SpanQueryParserBase.unescape(term));
+ tokens.add(token);
+ }
+
+ /**
+ * Converts a raw captured term into a token and appends it to tokens.
+ *
+ * The term regex over-captures: the captured text may actually be an
+ * AND/OR/NOT boolean operator (only meaningful at nearDepth == 0), and it
+ * may carry a trailing boost (e.g. term^2), which is split off here.
+ *
+ * @param term raw captured term text
+ * @param nearDepth current "near" nesting depth; operators are only
+ * recognized outside of near clauses
+ * @param tokens token list; appended to in place
+ * @throws ParseException on an illegal operator sequence, a standalone
+ * boost, or more than one boost on a single term
+ */
+ private void addRawTerm(String term, int nearDepth, List tokens)
+ throws ParseException{
+ //The regex over-captures on a term...Term could be:
+ //AND or NOT boolean operator; and term could have boost
+
+ //does the term == AND or NOT or OR
+ if (nearDepth == 0){
+ SQPToken token = null;
+ if (term.equals(AND)){
+ token = new SQPBooleanOpToken(SpanQueryParserBase.CONJ_AND);
+ } else if (term.equals(NOT)){
+ token = new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT);
+ } else if (term.equals(OR)){
+ token = new SQPBooleanOpToken(SpanQueryParserBase.CONJ_OR);
+ }
+ if (token != null){
+ //verify this operator may legally follow the previous token
+ testBooleanTokens(tokens, (SQPBooleanOpToken)token);
+ tokens.add(token);
+ return;
+ }
+
+ }
+ //not an operator: split off an optional boost suffix
+ SQPToken token = null;
+ Matcher m = TERM_BOOST.matcher(term);
+
+ int boosts = 0;
+ while (m.find()){
+
+ if (m.group(1) != null){
+ //text before the boost marker is the term proper
+ token = new SQPTerm(unescape(term.substring(0, m.start(1)-1)));
+ tryToSetBoost((SQPBoostableToken)token, m.group(1));
+ boosts++;
+ //a boost with no preceding term text (e.g. "^2") is illegal
+ if (m.start(1) == 1 && m.end(1) == term.length()){
+ throw new ParseException("Can't have a boost as a standalone term");
+ }
+ }
+
+ }
+ if (boosts > 1){
+ throw new ParseException("Can't have more than one boost on a term");
+ }
+ if (token == null){
+ //no boost found: the whole capture is the term
+ token = new SQPTerm(unescape(term));
+ }
+
+
+ tokens.add(token);
+ }
+
+ /**
+ * Tests whether this boolean operator token may legally follow the last
+ * token already in tokens, per classic queryparser rules: no operator may
+ * follow a modifier (+/-) or NOT, AND may not follow AND, and nothing but
+ * a modifier may follow OR.
+ *
+ * An empty token list is accepted here; start-of-clause problems are
+ * tested elsewhere.
+ *
+ * @param tokens tokens collected so far
+ * @param token the boolean operator about to be appended
+ * @throws ParseException on an illegal operator combination
+ */
+ private void testBooleanTokens(List tokens, SQPBooleanOpToken token)
+ throws ParseException {
+ //there are possible exceptions with tokens.size()==0, but they
+ //are the same exceptions as at clause beginning.
+ //Need to test elsewhere for start of clause issues.
+ if (tokens.size() == 0){
+ return;
+ }
+ SQPToken t = tokens.get(tokens.size()-1);
+ if (t instanceof SQPBooleanOpToken){
+ int curr = ((SQPBooleanOpToken)t).getType();
+ int nxt = token.getType();
+ boolean ex = false;
+ if (SQPBooleanOpToken.isMod(curr)){
+ //nothing may directly follow a +/- modifier
+ ex = true;
+ } else if ( curr == SpanQueryParser.CONJ_AND &&
+ nxt == SpanQueryParser.CONJ_AND){
+ //AND AND is illegal
+ ex = true;
+ } else if( curr == SpanQueryParser.CONJ_OR &&
+ ! SQPBooleanOpToken.isMod(nxt) ){
+ //only a modifier may follow OR
+ ex = true;
+ } else if (curr == SpanQueryParser.MOD_NOT){
+ //nothing may directly follow NOT
+ ex = true;
+ }
+ if (ex == true){
+ throw new ParseException("Illegal combination of boolean conjunctions and modifiers");
+ }
+ }
+ }
+
+ /**
+ * Verifies that a clause consisting of exactly one token has a terminal
+ * (term/range/regex) as that token. Empty lists and multi-token lists
+ * pass through untouched.
+ *
+ * @param tokens tokens for the clause
+ * @throws ParseException if the single token is not a terminal
+ */
+ private void testSingle(List tokens) throws ParseException{
+ if (tokens.size() == 0){
+ return;
+ }
+ //original had an empty then-branch with the throw in the else;
+ //inverted the test so the exceptional case is stated directly
+ if (tokens.size() == 1 && !(tokens.get(0) instanceof SQPTerminal)){
+ throw new ParseException("Must have at least one terminal");
+ }
+ }
+
+ /**
+ * Maps the escaped boolean operators (\AND, \NOT, \OR) back to their
+ * literal forms by stripping the single leading backslash; every other
+ * string is returned unchanged.
+ *
+ * @param s candidate escaped operator
+ * @return unescaped operator, or s itself
+ */
+ private String unescape(String s){
+ boolean escapedOp = s.equals("\\AND") || s.equals("\\NOT") || s.equals("\\OR");
+ return escapedOp ? s.substring(1) : s;
+ }
+
+ /**
+ * Unescapes a captured regex body: an escaped slash (\/) becomes a bare
+ * slash, while every other escaped character keeps its backslash.
+ * Returns the input unchanged when nothing matched.
+ *
+ * NOTE(review): the copy step uses m.start(0) but advances last to
+ * m.end(1) -- this is only safe if group 1 always ends where the whole
+ * match ends; confirm against the UNESCAPE_REGEX pattern.
+ *
+ * @param s escaped regex text
+ * @return unescaped regex text
+ */
+ private String unescapeRegex(String s){
+
+ Matcher m = UNESCAPE_REGEX.matcher(s);
+ StringBuilder sb = new StringBuilder();
+ int last = 0;
+ while (m.find()){
+ //copy the literal run before this escape
+ sb.append(s.substring(last, m.start(0)));
+ if (m.group(1).equals("/")){
+ sb.append("/");
+ } else {
+ //keep the backslash for every other escaped character
+ sb.append("\\").append(m.group(1));
+ }
+
+ last = m.end(1);
+ }
+ if (last == 0){
+ //no escapes found; avoid building a new string
+ return s;
+ }
+ sb.append(s.substring(last));
+ return sb.toString();
+ }
+
+ private String escapeDQuote(String s) {
+ //copied from escape in QueryParserBase. Had to remove \\
+ //to handle quoted single terms.
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ // These characters are part of the query syntax and must be escaped
+ if (c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':'
+ || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~'
+ || c == '*' || c == '?' || c == '|' || c == '&' || c == '/') {
+ sb.append('\\');
+ }
+ sb.append(c);
+ }
+ return sb.toString();
+ }
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java (revision 0)
@@ -0,0 +1,183 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.queryparser.classic.ParseException;
+
+import org.apache.lucene.queryparser.spans.tokens.SQPBoostableToken;
+import org.apache.lucene.queryparser.spans.tokens.SQPClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPNearClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPNotNearClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPOrClause;
+import org.apache.lucene.queryparser.spans.tokens.SQPRangeTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPRegexTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPTerm;
+import org.apache.lucene.queryparser.spans.tokens.SQPTerminal;
+import org.apache.lucene.queryparser.spans.tokens.SQPToken;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
+public abstract class AbstractSpanQueryParser extends SpanQueryParserBase {
+
+ @Override
+ abstract public Query parse(String s) throws ParseException;
+
+
+ /**
+ * Recursively builds a SpanQuery from the tokens covered by parentClause.
+ *
+ * This assumes the token range contains no FIELD tokens and no BOOLEAN
+ * operators; any such token triggers a ParseException.
+ *
+ * NOTE(review): raw List types throughout this class -- consider
+ * List&lt;SQPToken&gt; / List&lt;SpanQuery&gt; generics.
+ *
+ * @param tokens full token list for the query
+ * @param field field to build queries against
+ * @param parentClause clause whose start/end offsets delimit the tokens to consume
+ * @return SpanQuery for the clause (possibly an empty span query)
+ * @throws ParseException on a field, boolean operator or match-all token
+ */
+ protected SpanQuery _parsePureSpanClause(final List tokens,
+ String field, SQPClause parentClause)
+ throws ParseException{
+
+ int start = parentClause.getTokenOffsetStart();
+ int end = parentClause.getTokenOffsetEnd();
+ if (end-start == 1){
+
+ //single-child "near" over a plain term may get special treatment
+ //(e.g. analysis may split the term into several components)
+ if (parentClause instanceof SQPNearClause){
+ SQPNearClause nc = (SQPNearClause)parentClause;
+ SQPToken t = tokens.get(start);
+ if (t instanceof SQPTerm){
+
+ SpanQuery ret = trySpecialHandlingForSpanNearWithOneComponent(field, (SQPTerm)t, nc);
+ if (ret != null){
+ if (parentClause.getBoost() != SpanQueryParserBase.UNSPECIFIED_BOOST){
+ ret.setBoost(parentClause.getBoost());
+ }
+ return ret;
+ }
+ }
+ }
+ }
+
+ List queries = new ArrayList();
+ int i = start;
+ while (i < end){
+ SQPToken t = tokens.get(i);
+ SpanQuery q = null;
+ if (t instanceof SQPClause){
+ //nested clause: recurse, then skip past its token range
+ SQPClause c = (SQPClause)t;
+ q = _parsePureSpanClause(tokens, field, c);
+ i = c.getTokenOffsetEnd();
+ } else if (t instanceof SQPTerminal){
+ q = buildTerminal(field, (SQPTerminal)t);
+ i++;
+ } else {
+ throw new ParseException("Can't process field, boolean operators or a match all docs query in a pure span.");
+ }
+ if (q != null){
+ queries.add(q);
+ }
+ }
+ //NOTE(review): queries can never be null here; the size() == 0 test alone suffices
+ if (queries == null || queries.size() == 0){
+ return getEmptySpanQuery();
+ }
+ return buildSpanQueryClause(queries, parentClause);
+ }
+
+
+ /**
+ * Delegates a single-term "near" clause to
+ * specialHandlingForSpanNearWithOneComponent, filling in defaults for
+ * unspecified slop (getPhraseSlop()) and order (true).
+ *
+ * @return SpanQuery or null if no special handling applied
+ */
+ private SpanQuery trySpecialHandlingForSpanNearWithOneComponent(String field,
+ SQPTerm token, SQPNearClause clause)
+ throws ParseException{
+
+ int slop = (clause.getSlop() == SpanQueryParserBase.UNSPECIFIED_SLOP) ? getPhraseSlop() : clause.getSlop();
+ boolean order = clause.getInOrder() == null ? true : clause.getInOrder().booleanValue();
+
+ SpanQuery ret = (SpanQuery)specialHandlingForSpanNearWithOneComponent(field,
+ token.getString(), slop, order);
+ return ret;
+
+ }
+
+ /**
+ * Builds the SpanQuery for a single terminal token (regex, term or range),
+ * applying the token's boost when one was specified.
+ *
+ * @param field field to build the query against
+ * @param token terminal token
+ * @return SpanQuery, or null if the underlying query was not a SpanQuery
+ * or the token type was unrecognized
+ */
+ protected SpanQuery buildTerminal(String field, SQPTerminal token) throws ParseException{
+
+
+ Query q = null;
+ if (token instanceof SQPRegexTerm){
+ q = getRegexpQuery(field, ((SQPRegexTerm)token).getString());
+ } else if (token instanceof SQPTerm){
+ q = buildAnySingleTermQuery(field, ((SQPTerm)token).getString(), ((SQPTerm)token).isQuoted());
+ } else if (token instanceof SQPRangeTerm){
+ SQPRangeTerm rt = (SQPRangeTerm)token;
+ q = getRangeQuery(field, rt.getStart(), rt.getEnd(),
+ rt.getStartInclusive(), rt.getEndInclusive());
+ }
+ if (q != null && token instanceof SQPBoostableToken){
+ float boost = ((SQPBoostableToken)token).getBoost();
+ if (boost != SpanQueryParserBase.UNSPECIFIED_BOOST){
+ q.setBoost(boost);
+ }
+ }
+ if (q != null && q instanceof SpanQuery){
+ return (SpanQuery)q;
+ }
+ return null;
+ }
+
+ /**
+ * Combines child queries into the SpanQuery variant matching the clause
+ * type (or / near / not-near) and applies the clause's boost.
+ *
+ * NOTE(review): if clause is none of the three handled types, q stays
+ * null and the setBoost/getBoost calls below throw NullPointerException;
+ * confirm all SQPClause subtypes are covered or add an explicit error.
+ */
+ private SpanQuery buildSpanQueryClause(List queries, SQPClause clause)
+ throws ParseException {
+ SpanQuery q = null;
+ if (clause instanceof SQPOrClause){
+ q = buildSpanOrQuery(queries);
+ } else if (clause instanceof SQPNearClause){
+
+ int slop = ((SQPNearClause)clause).getSlop();
+ if (slop == UNSPECIFIED_SLOP){
+ slop = getPhraseSlop();
+ }
+ Boolean inOrder = ((SQPNearClause)clause).getInOrder();
+ boolean order = false;
+ if (inOrder == null){
+ //unspecified order defaults to in-order only when slop == 0
+ order = slop > 0 ? false : true;
+ } else {
+ order = inOrder.booleanValue();
+ }
+ q = buildSpanNearQuery(queries,
+ slop, order);
+ } else if (clause instanceof SQPNotNearClause){
+ q = buildSpanNotNearQuery(queries,
+ ((SQPNotNearClause)clause).getNotPre(),
+ ((SQPNotNearClause)clause).getNotPost());
+
+ }
+
+ if (clause.getBoost() != UNSPECIFIED_BOOST){
+ q.setBoost(clause.getBoost());
+ }
+ //now update boost if clause only had one child
+ //NOTE(review): this branch appears unreachable -- when
+ //clause.getBoost() != UNSPECIFIED_BOOST the setBoost call above has
+ //already made q.getBoost() != UNSPECIFIED_BOOST
+ if (q.getBoost() == UNSPECIFIED_BOOST &&
+ clause.getBoost() != UNSPECIFIED_BOOST && (
+ q instanceof SpanTermQuery ||
+ q instanceof SpanMultiTermQueryWrapper)){
+ q.setBoost(clause.getBoost());
+ }
+
+ return q;
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AnalyzingQueryParserBase.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AnalyzingQueryParserBase.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AnalyzingQueryParserBase.java (revision 0)
@@ -0,0 +1,301 @@
+package org.apache.lucene.queryparser.spans;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Version;
+
+/**
+ * Enables setting different Analyzers for different fields.
+ *
+ * Enables setting different analyzers for whole term vs.
+ * multiTerm (wildcard, fuzzy, prefix).
+ *
+ */
+public abstract class AnalyzingQueryParserBase extends QueryParserBase{
+
+ //how to normalize wildcard/fuzzy/prefix (multiTerm) components
+ public enum NORM_MULTI_TERMS {
+ ANALYZE,
+ LOWERCASE,
+ NONE
+ };
+
+ //NOTE(review): raw Map/HashMap -- consider Map&lt;String, Analyzer&gt; generics
+ private Map wholeTermAnalyzers = new HashMap();
+ private Map multiTermAnalyzers = new HashMap();
+ private NORM_MULTI_TERMS normMultiTerms = NORM_MULTI_TERMS.LOWERCASE;
+
+ //group 1: an escaped character; group 2: a run of wildcard characters
+ //NOTE(review): pattern is stateless and could be static final
+ private final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");
+
+
+ private Analyzer multiTermAnalyzer;
+ /**
+ * Default initialization. The analyzer is used for both whole terms and multiTerms.
+ */
+ @Override
+ public void init(Version matchVersion, String f, Analyzer a) {
+ super.init(matchVersion, f, a);
+ this.multiTermAnalyzer = a;
+
+ }
+
+ /**
+ * Expert. Set a different analyzer for whole terms vs. multiTerm subcomponents.
+ *
+ * This initializer has a side effect of setting normMultiTerms = NORM_MULTI_TERMS.ANALYZE
+ * @param matchVersion
+ * @param f
+ * @param a
+ * @param multiTermAnalyzer
+ */
+ public void init(Version matchVersion, String f, Analyzer a, Analyzer multiTermAnalyzer) {
+ super.init(matchVersion, f, a);
+ this.multiTermAnalyzer = multiTermAnalyzer;
+ setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE);
+ }
+
+ /**
+ * Notionally overrides functionality from analyzeMultitermTerm. Differences
+ * are that this consumes the full tokenstream, and it throws ParseException
+ * if it encounters no content terms or more than one.
+ *
+ * @param field field the analyzer should be keyed on
+ * @param part text to analyze
+ * @param analyzerIn analyzer to use; null falls back to getMultiTermAnalyzer()
+ * @return bytesRef to term part
+ * @throws ParseException, RuntimeException
+ */
+ protected BytesRef analyzeMultitermTermParseEx(String field, String part, Analyzer analyzerIn)
+ throws ParseException {
+ //TODO: In QueryParserBase, analyzeMultiTerm doesn't currently consume all tokens, and it
+ //throws RuntimeExceptions and IllegalArgumentExceptions instead of parse.
+ //Otherwise this is copied verbatim.
+ TokenStream source;
+
+ if (analyzerIn == null) analyzerIn = getMultiTermAnalyzer();
+
+ try {
+ source = analyzerIn.tokenStream(field, part);
+ source.reset();
+ } catch (IOException e) {
+ //NOTE(review): cause e is dropped; consider chaining once ParseException supports it
+ throw new ParseException("Unable to initialize TokenStream to analyze multiTerm term: " + part);
+ }
+
+ TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
+ BytesRef bytes = termAtt.getBytesRef();
+
+ int partCount = 0;
+ try {
+ if (!source.incrementToken()){
+ //no tokens produced; partCount stays 0 and we throw below
+ } else {
+ partCount++;
+ termAtt.fillBytesRef();
+ //keep consuming so the stream is fully drained; extra tokens
+ //only raise partCount and trigger the exception below
+ while (source.incrementToken()){
+ partCount++;
+ }
+
+ }
+ } catch (IOException e1) {
+ throw new RuntimeException("Error analyzing multiterm: " + part);
+ }
+
+ try {
+ source.end();
+ source.close();
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part);
+ }
+ if (partCount != 1){
+ throw new ParseException("Couldn't find any content in >"+ part+"<");
+ }
+ return BytesRef.deepCopyOf(bytes);
+ }
+
+ //TODO: make this protected in QueryParserBase and then override it
+ //modify to throw only parse exception
+ protected BytesRef analyzeMultitermTermParseEx(String field, String part) throws ParseException{
+ BytesRef b = null;
+ try{
+ b = analyzeMultitermTermParseEx(field, part, getMultiTermAnalyzer(field));
+ } catch (IllegalArgumentException e){
+ //normalize analyzer complaints into ParseException for callers
+ throw new ParseException("Couldn't find any content in >"+ part+"<");
+ }
+ return b;
+ }
+
+ /**
+ * Analysis of wildcards is a bit tricky. This splits a term by wildcard
+ * and then analyzes the subcomponents, leaving the wildcard characters
+ * themselves untouched.
+ *
+ * @param field field the analyzer should be keyed on
+ * @param termText wildcard term text
+ * @return analyzed wildcard
+ * @throws ParseException
+ */
+ protected String analyzeWildcard(String field, String termText) throws ParseException {
+ // plagiarized from AnalyzingQueryParser
+ Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termText);
+ StringBuilder sb = new StringBuilder();
+ int last = 0;
+
+ while (wildcardMatcher.find()) {
+ // continue if escaped char
+ if (wildcardMatcher.group(1) != null) {
+ continue;
+ }
+
+ if (wildcardMatcher.start() > 0) {
+ //analyze the literal chunk preceding this wildcard run
+ String chunk = termText.substring(last, wildcardMatcher.start());
+ BytesRef analyzed = analyzeMultitermTermParseEx(field, chunk);
+ sb.append(analyzed.utf8ToString());
+ }
+ // append the wildcard character
+ sb.append(wildcardMatcher.group(2));
+
+ last = wildcardMatcher.end();
+ }
+ if (last < termText.length()) {
+ //analyze any trailing literal chunk after the final wildcard
+ sb.append(analyzeMultitermTermParseEx(field, termText.substring(last)).utf8ToString());
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Set different analyzers for different fields. During parsing, if a field isn't
+ * found in this map, the default getAnalyzer() analyzer is used.
+ *
+ * @param wholeTermAnalyzers map of field name to analyzer
+ */
+ public void setAnalyzers(Map wholeTermAnalyzers){
+ this.wholeTermAnalyzers = wholeTermAnalyzers;
+ }
+
+ /**
+ * Expert. Set different analyzers (wholeTerm and multiTerm) for different fields.
+ * During parsing, if a field isn't found in wholeTermAnalyzers, getAnalyzer() is called.
+ * If a field isn't found in multiTermAnalyzers, then getMultiTermAnalyzer() is called.
+ *
+ * As a side effect, this sets normMultiTerms to NORM_MULTI_TERMS.ANALYZE
+ *
+ * If set to null, the default analyzer will be used for all fields.
+ *
+ * @param wholeTermAnalyzers map of field name to whole-term analyzer
+ * @param multiTermAnalyzers map of field name to multiTerm analyzer
+ */
+ public void setAnalyzers(Map wholeTermAnalyzers, Map multiTermAnalyzers){
+ this.wholeTermAnalyzers = wholeTermAnalyzers;
+ this.multiTermAnalyzers = multiTermAnalyzers;
+ normMultiTerms = NORM_MULTI_TERMS.ANALYZE;
+ }
+ /**
+ * If set to true, normMultiTerms is set to NORM_MULTI_TERMS.LOWERCASE.
+ * If set to false, this turns off all normalization and sets normMultiTerms to NORM_MULTI_TERMS.NONE.
+ *
+ * @deprecated use {@link #setNormMultiTerms(NORM_MULTI_TERMS)}
+ */
+ @Override
+ @Deprecated
+ public void setLowercaseExpandedTerms(boolean lc){
+ if (lc == true){
+ normMultiTerms = NORM_MULTI_TERMS.LOWERCASE;
+ } else {
+ normMultiTerms = NORM_MULTI_TERMS.NONE;
+ }
+ super.setLowercaseExpandedTerms(lc);
+ }
+
+ /**
+ * Returns true if normMultiTerms == NORM_MULTI_TERMS.LOWERCASE
+ * @deprecated use {@link #getNormMultiTerms()}
+ */
+ @Override
+ @Deprecated
+ public boolean getLowercaseExpandedTerms(){
+ if (normMultiTerms == NORM_MULTI_TERMS.LOWERCASE){
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ *
+ * @return analyzer to use for multiTerms if a field isn't specified or
+ * not found in the multiTermAnalyzers map.
+ */
+ public Analyzer getMultiTermAnalyzer() {
+ return multiTermAnalyzer;
+ }
+
+ /**
+ *
+ * @return type of normalization to perform on multiTerms
+ */
+ public NORM_MULTI_TERMS getNormMultiTerms() {
+ return normMultiTerms;
+ }
+
+ public void setNormMultiTerms(NORM_MULTI_TERMS norm) {
+ this.normMultiTerms = norm;
+ //TODO: get rid of these side effects once deprecated setLowercaseExpandedTerms is gone.
+ //These are currently needed because (at least) regexp creation
+ //is driven by QueryParserBase, which still relies on these.
+ if (norm == NORM_MULTI_TERMS.LOWERCASE){
+ setLowercaseExpandedTerms(true);
+ } else if (norm == NORM_MULTI_TERMS.NONE){
+ setLowercaseExpandedTerms(false);
+ }
+ }
+
+ /**
+ *
+ * @param field requested field
+ * @return analyzer to use on a requested field for whole terms. Returns getAnalyzer() if
+ * field is not found in wholeTermAnalyzers.
+ */
+ public Analyzer getWholeTermAnalyzer(String field){
+ if (wholeTermAnalyzers != null &&
+ wholeTermAnalyzers.containsKey(field)){
+ return wholeTermAnalyzers.get(field);
+ }
+ return getAnalyzer();
+ }
+
+ /**
+ *
+ * @param field requested field
+ * @return analyzer to use on a requested field for multiTerm terms. Returns getMultiTermAnalyzer()
+ * if field is not found in multiTermAnalyzers
+ */
+ public Analyzer getMultiTermAnalyzer(String field){
+ if (multiTermAnalyzers != null &&
+ multiTermAnalyzers.containsKey(field)){
+ return multiTermAnalyzers.get(field);
+ }
+ return getMultiTermAnalyzer();
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/package.html
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/package.html (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/package.html (revision 0)
@@ -0,0 +1,28 @@
+
+
+
+
+
+SpanQueryParser is the main class in this package.
+
+
+The SpanOnlyParser parses a subset of the overall syntax (no boolean logic, no field information, and no *:* match-all queries).
+
+
+
+