Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestMultiAnalyzer.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestMultiAnalyzer.java (revision 0) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestMultiAnalyzer.java (revision 0) @@ -0,0 +1,215 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.spans.SpanQueryParser; + +/** + * Test SpanQueryParser's ability to deal with Analyzers that return more + * than one token per position or that return tokens with a position + * increment > 1. + * + * Copied nearly verbatim from the classic QueryParser's TestMultiAnalyzer. 
+ * + */ +public class TestMultiAnalyzer extends BaseTokenStreamTestCase{ + private static int multiToken = 0; + + public void testMultiAnalyzer() throws ParseException { + + SpanQueryParser qp = new SpanQueryParser(TEST_VERSION_CURRENT, "", new MultiAnalyzer()); + + // trivial, no multiple tokens: + assertEquals("foo", qp.parse("foo").toString()); + assertEquals("foo", qp.parse("\"foo\"").toString()); + assertEquals("foo foobar", qp.parse("foo foobar").toString()); + assertEquals("spanNear([foo, foobar], 0, true)", qp.parse("\"foo foobar\"").toString()); + assertEquals("spanNear([foo, foobar, blah], 0, true)", qp.parse("\"foo foobar blah\"").toString()); + + // two tokens at the same position: + assertEquals("spanOr([multi, multi2]) foo", qp.parse("multi foo").toString()); + assertEquals("foo spanOr([multi, multi2])", qp.parse("foo multi").toString()); + assertEquals("spanOr([multi, multi2]) spanOr([multi, multi2])", qp.parse("multi multi").toString()); + assertEquals("+(foo spanOr([multi, multi2])) +(bar spanOr([multi, multi2]))", + qp.parse("+(foo multi) +(bar multi)").toString()); + assertEquals("+(foo spanOr([multi, multi2])) spanNear([field:bar, spanOr([field:multi, field:multi2])], 0, true)", + qp.parse("+(foo multi) field:\"bar multi\"").toString()); + + // phrases: + assertEquals("spanNear([spanOr([multi, multi2]), foo], 0, true)", qp.parse("\"multi foo\"").toString()); + assertEquals("spanNear([foo, spanOr([multi, multi2])], 0, true)", qp.parse("\"foo multi\"").toString()); + assertEquals("spanNear([foo, spanOr([multi, multi2]), foobar, spanOr([multi, multi2])], 0, true)", + qp.parse("\"foo multi foobar multi\"").toString()); + + // fields: + assertEquals("spanOr([field:multi, field:multi2]) field:foo", qp.parse("field:multi field:foo").toString()); + assertEquals("spanNear([spanOr([field:multi, field:multi2]), field:foo], 0, true)", qp.parse("field:\"multi foo\"").toString()); + + // three tokens at one position: + assertEquals("spanOr([triplemulti, 
multi3, multi2])", qp.parse("triplemulti").toString()); + assertEquals("foo spanOr([triplemulti, multi3, multi2]) foobar", + qp.parse("foo triplemulti foobar").toString()); + + // phrase with non-default slop: + assertEquals("spanNear([spanOr([multi, multi2]), foo], 10, false)", qp.parse("\"multi foo\"~10").toString()); + + // phrase with non-default boost: + assertEquals("spanNear([spanOr([multi, multi2]), foo], 0, true)^2.0", qp.parse("\"multi foo\"^2").toString()); + + // phrase after changing default slop + qp.setPhraseSlop(99); + assertEquals("spanNear([spanOr([multi, multi2]), foo], 99, false) bar", + qp.parse("\"multi foo\" bar").toString()); + assertEquals("spanNear([spanOr([multi, multi2]), foo], 99, false) spanNear([foo, bar], 2, false)", + qp.parse("\"multi foo\" \"foo bar\"~2").toString()); + qp.setPhraseSlop(0); + + } + + + public void testPosIncrementAnalyzer() throws ParseException { + SpanQueryParser qp = new SpanQueryParser(TEST_VERSION_CURRENT,"", new PosIncrementAnalyzer()); + assertEquals("quick brown", qp.parse("the quick brown").toString()); + assertEquals("quick brown fox", qp.parse("the quick brown fox").toString()); + } + + /** + * Expands "multi" to "multi" and "multi2", both at the same position, + * and expands "triplemulti" to "triplemulti", "multi3", and "multi2". 
+ */ + private class MultiAnalyzer extends Analyzer { + + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); + return new TokenStreamComponents(result, new TestFilter(result)); + } + } + + private final class TestFilter extends TokenFilter { + + private String prevType; + private int prevStartOffset; + private int prevEndOffset; + + private final CharTermAttribute termAtt; + private final PositionIncrementAttribute posIncrAtt; + private final OffsetAttribute offsetAtt; + private final TypeAttribute typeAtt; + + public TestFilter(TokenStream in) { + super(in); + termAtt = addAttribute(CharTermAttribute.class); + posIncrAtt = addAttribute(PositionIncrementAttribute.class); + offsetAtt = addAttribute(OffsetAttribute.class); + typeAtt = addAttribute(TypeAttribute.class); + } + + @Override + public final boolean incrementToken() throws java.io.IOException { + if (multiToken > 0) { + termAtt.setEmpty().append("multi"+(multiToken+1)); + offsetAtt.setOffset(prevStartOffset, prevEndOffset); + typeAtt.setType(prevType); + posIncrAtt.setPositionIncrement(0); + multiToken--; + return true; + } else { + boolean next = input.incrementToken(); + if (!next) { + return false; + } + prevType = typeAtt.type(); + prevStartOffset = offsetAtt.startOffset(); + prevEndOffset = offsetAtt.endOffset(); + String text = termAtt.toString(); + if (text.equals("triplemulti")) { + multiToken = 2; + return true; + } else if (text.equals("multi")) { + multiToken = 1; + return true; + } else { + return true; + } + } + } + + @Override + public void reset() throws IOException { + super.reset(); + this.prevType = null; + this.prevStartOffset = 0; + this.prevEndOffset = 0; + } + } + + /** + * Analyzes "the quick brown" as: quick(incr=2) brown(incr=1). + * Does not work correctly for input other than "the quick brown ...". 
+ */ + private class PosIncrementAnalyzer extends Analyzer { + + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); + return new TokenStreamComponents(result, new TestPosIncrementFilter(result)); + } + } + + private final class TestPosIncrementFilter extends TokenFilter { + + CharTermAttribute termAtt; + PositionIncrementAttribute posIncrAtt; + + public TestPosIncrementFilter(TokenStream in) { + super(in); + termAtt = addAttribute(CharTermAttribute.class); + posIncrAtt = addAttribute(PositionIncrementAttribute.class); + } + + @Override + public final boolean incrementToken () throws java.io.IOException { + while(input.incrementToken()) { + if (termAtt.toString().equals("the")) { + // stopword, do nothing + } else if (termAtt.toString().equals("quick")) { + posIncrAtt.setPositionIncrement(2); + return true; + } else { + posIncrAtt.setPositionIncrement(1); + return true; + } + } + return false; + } + } + +} Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQueryParserLexer.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQueryParserLexer.java (revision 0) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQueryParserLexer.java (revision 0) @@ -0,0 +1,801 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import static org.junit.Assert.*; + +import java.util.List; + +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.spans.SpanQueryLexer; +import org.apache.lucene.queryparser.spans.SpanQueryParserBase; +import org.apache.lucene.queryparser.spans.tokens.SQPBooleanOpToken; +import org.apache.lucene.queryparser.spans.tokens.SQPBoostableToken; +import org.apache.lucene.queryparser.spans.tokens.SQPClause.TYPE; +import org.apache.lucene.queryparser.spans.tokens.SQPField; +import org.apache.lucene.queryparser.spans.tokens.SQPNearClause; +import org.apache.lucene.queryparser.spans.tokens.SQPNotNearClause; +import org.apache.lucene.queryparser.spans.tokens.SQPOrClause; +import org.apache.lucene.queryparser.spans.tokens.SQPRangeTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPRegexTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPToken; + +import org.junit.Test; + +public class TestSpanQueryParserLexer { + SpanQueryLexer lexer = new SpanQueryLexer(); + + @Test + public void testFields() throws ParseException{ + executeSingleTokenTest( + "the quick f1: brown fox", + 2, + new SQPField("f1") + ); + + //no space + executeSingleTokenTest( + "the quick f1:brown fox", + 2, + new SQPField("f1") + ); + + boolean ex = false; + try{ + //non-escaped colon + + executeSingleTokenTest( + "the quick f1:f2:brown fox", + 2, + new SQPField("f1") + ); + } catch (ParseException e){ + ex = true; + } + assertTrue(ex); + //escaped colon + 
executeSingleTokenTest( + "the quick f1\\:f2:brown fox", + 2, + new SQPField("f1:f2") + ); + + //escaped colon + executeSingleTokenTest( + "the quick f1\\:f2:brown fox", + 3, + new SQPTerm("brown") + ); + executeSingleTokenTest( + "the quick f1\\ f2: brown fox", + 2, + new SQPField("f1 f2") + ); + + //fields should not be tokenized within a regex + executeSingleTokenTest( + "the quick /f1: brown/ fox", + 2, + new SQPRegexTerm("f1: brown") + ); + + //fields are tokenized within parens + executeSingleTokenTest( + "the quick (f1: brown fox)", + 3, + new SQPField("f1") + ); + + ex = false; + try{ + executeSingleTokenTest( + "the quick \"f1: brown fox\"", + 3, + null + ); + } catch (ParseException e){ + ex = true; + } + assertTrue(ex); + ex = false; + try{ + //fields are tokenized within brackets + executeSingleTokenTest( + "the quick [f1: brown fox]", + 3, + new SQPField("f1") + ); + } catch (ParseException e){ + ex = true; + } + assertTrue(ex); + + } + + @Test + public void testRegexes() throws ParseException{ + executeSingleTokenTest( + "the quick [brown (/rabb.?t/ /f?x/)]", + 5, + new SQPRegexTerm("rabb.?t") + ); + + executeSingleTokenTest( + "the quick [brown (ab/rabb.?t/cd /f?x/)]", + 6, + new SQPRegexTerm("rabb.?t") + ); + + //test regex unescape + executeSingleTokenTest( + "the quick [brown (/ra\\wb\\db\\/t/ /f?x/)]", + 5, + new SQPRegexTerm("ra\\wb\\db/t") + ); + + //test operators within regex + executeSingleTokenTest( + "the quick [brown (/(?i)a(b)+[c-e]*(f|g){0,3}/ /f?x/)]", + 5, + new SQPRegexTerm("(?i)a(b)+[c-e]*(f|g){0,3}") + ); + + } + + @Test + public void testOr() throws ParseException{ + SQPOrClause truth = new SQPOrClause(2,5); + truth.setMinimumNumberShouldMatch(SQPOrClause.DEFAULT_MINIMUM_NUMBER_SHOULD_MATCH); + + executeSingleTokenTest( + "the quick (brown fox) jumped", + 2, + truth + ); + + truth.setMinimumNumberShouldMatch(23); + executeSingleTokenTest( + "the quick (brown fox)~23 jumped", + 2, + truth + ); + + 
truth.setMinimumNumberShouldMatch(2); + executeSingleTokenTest( + "the quick (brown fox)~ jumped", + 2, + truth + ); + + boolean ex = false; + try{ + executeSingleTokenTest( + "the [quick (brown fox)~23 jumped]", + 23, + truth + ); + } catch (ParseException e){ + ex = true; + } + assertTrue(ex); + + ex = false; + try{ + executeSingleTokenTest( + "the [quick (brown fox)~ jumped]", + 3, + truth + ); + } catch (ParseException e){ + ex = true; + } + assertTrue(ex); + + try{ + executeSingleTokenTest( + "the \"quick (brown fox)~23 jumped\"", + 23, + truth + ); + } catch (ParseException e){ + ex = true; + } + assertTrue(ex); + + ex = false; + try{ + executeSingleTokenTest( + "the \"quick (brown fox)~ jumped\"", + 3, + truth + ); + } catch (ParseException e){ + ex = true; + } + assertTrue(ex); + } + + @Test + public void testNear() throws ParseException{ + + SQPNearClause truth = new SQPNearClause(3, 5, TYPE.QUOTE, false, + SQPNearClause.UNSPECIFIED_IN_ORDER, + SpanQueryParserBase.UNSPECIFIED_SLOP); + executeSingleTokenTest( + "the quick \"brown fox\" jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.QUOTE, true, + false, + SpanQueryParserBase.UNSPECIFIED_SLOP); + executeSingleTokenTest( + "the quick \"brown fox\"~ jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.QUOTE, true, + true, + SpanQueryParserBase.UNSPECIFIED_SLOP); + executeSingleTokenTest( + "the quick \"brown fox\"~> jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.QUOTE, true, + false, + 3); + executeSingleTokenTest( + "the quick \"brown fox\"~3 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.QUOTE, true, + true, + 3); + executeSingleTokenTest( + "the quick \"brown fox\"~>3 jumped", + 2, + truth + ); + + //now try with boosts + truth = new SQPNearClause(3, 5, TYPE.QUOTE, false, + SQPNearClause.UNSPECIFIED_IN_ORDER, + SpanQueryParserBase.UNSPECIFIED_SLOP); + truth.setBoost(new Float(2.5)); + + executeSingleTokenTest( + "the quick \"brown 
fox\"^2.5 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.QUOTE, true, + false, + SpanQueryParserBase.UNSPECIFIED_SLOP); + truth.setBoost(new Float(2.5)); + + executeSingleTokenTest( + "the quick \"brown fox\"~^2.5 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.QUOTE, true, + true, + SpanQueryParserBase.UNSPECIFIED_SLOP); + truth.setBoost(new Float(2.5)); + executeSingleTokenTest( + "the quick \"brown fox\"~>^2.5 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.QUOTE, true, + false, + 3); + truth.setBoost(new Float(2.5)); + + executeSingleTokenTest( + "the quick \"brown fox\"~3^2.5 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.QUOTE, true, + true, + 3); + truth.setBoost(new Float(2.5)); + + executeSingleTokenTest( + "the quick \"brown fox\"~>3^2.5 jumped", + 2, + truth + ); + + //now test brackets + truth = new SQPNearClause(3, 5, TYPE.BRACKET, false, + SQPNearClause.UNSPECIFIED_IN_ORDER, + SpanQueryParserBase.UNSPECIFIED_SLOP); + + + executeSingleTokenTest( + "the quick [brown fox] jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.BRACKET, true, + false, + SpanQueryParserBase.UNSPECIFIED_SLOP); + executeSingleTokenTest( + "the quick [brown fox]~ jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.BRACKET, true, + true, + SpanQueryParserBase.UNSPECIFIED_SLOP); + + executeSingleTokenTest( + "the quick [brown fox]~> jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.BRACKET, true, + false, + 3); + + executeSingleTokenTest( + "the quick [brown fox]~3 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.BRACKET, true, + true, + 3); + + executeSingleTokenTest( + "the quick [brown fox]~>3 jumped", + 2, + truth + ); + + //now brackets with boosts + truth = new SQPNearClause(3, 5, TYPE.BRACKET, false, + SQPNearClause.UNSPECIFIED_IN_ORDER, + SpanQueryParserBase.UNSPECIFIED_SLOP); + truth.setBoost(new Float(2.5)); + + 
executeSingleTokenTest( + "the quick [brown fox]^2.5 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.BRACKET, true, + false, + SpanQueryParserBase.UNSPECIFIED_SLOP); + truth.setBoost(new Float(2.5)); + + executeSingleTokenTest( + "the quick [brown fox]~^2.5 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.BRACKET, true, + true, + SpanQueryParserBase.UNSPECIFIED_SLOP); + truth.setBoost(new Float(2.5)); + executeSingleTokenTest( + "the quick [brown fox]~>^2.5 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.BRACKET, true, + false, + 3); + truth.setBoost(new Float(2.5)); + + executeSingleTokenTest( + "the quick [brown fox]~3^2.5 jumped", + 2, + truth + ); + + truth = new SQPNearClause(3, 5, TYPE.BRACKET, true, + true, + 3); + truth.setBoost(new Float(2.5)); + + executeSingleTokenTest( + "the quick [brown fox]~>3^2.5 jumped", + 2, + truth + ); + } + + @Test + public void testBoosts() throws Exception { + String s = "apache^4"; + List tokens = lexer.getTokens(s); + SQPToken t = tokens.get(0); + assertEquals(new Float(4), new Float(((SQPBoostableToken)t).getBoost())); + assertTrue(t instanceof SQPTerm); + + s = "/apache/^4"; + tokens = lexer.getTokens(s); + t = tokens.get(0); + assertEquals(new Float(4), new Float(((SQPBoostableToken)t).getBoost())); + assertTrue(t instanceof SQPRegexTerm); + + s = "the [abc TO efg]^4 cat" ; + tokens = lexer.getTokens(s); + t = tokens.get(1); + assertEquals(new Float(4), new Float(((SQPBoostableToken)t).getBoost())); + assertTrue(t instanceof SQPRangeTerm); + + s = "apache^.4"; + tokens = lexer.getTokens(s); + t = tokens.get(0); + assertEquals(new Float(0.4), new Float(((SQPBoostableToken)t).getBoost())); + assertTrue(t instanceof SQPTerm); + + s = "apache^0.4"; + tokens = lexer.getTokens(s); + t = tokens.get(0); + assertEquals(new Float(0.4), new Float(((SQPBoostableToken)t).getBoost())); + assertTrue(t instanceof SQPTerm); + + //negatives should not be parsed as boosts, boost 
for these should be UNSPECIFIED_BOOST + s = "apache^-4"; + tokens = lexer.getTokens(s); + t = tokens.get(0); + assertEquals(new Float(SpanQueryParserBase.UNSPECIFIED_BOOST), new Float(((SQPBoostableToken)t).getBoost())); + assertTrue(t instanceof SQPTerm); + + s = "apache^-.4"; + tokens = lexer.getTokens(s); + t = tokens.get(0); + assertEquals(new Float(SpanQueryParserBase.UNSPECIFIED_BOOST), new Float(((SQPBoostableToken)t).getBoost())); + assertTrue(t instanceof SQPTerm); + + s = "apache^-0.4"; + tokens = lexer.getTokens(s); + t = tokens.get(0); + assertEquals(new Float(SpanQueryParserBase.UNSPECIFIED_BOOST), new Float(((SQPBoostableToken)t).getBoost())); + assertTrue(t instanceof SQPTerm); + + } + + @Test + public void testNotNear() throws ParseException{ + SQPNotNearClause truth = new SQPNotNearClause(3, 5, TYPE.QUOTE, + SQPNotNearClause.NOT_DEFAULT, SQPNotNearClause.NOT_DEFAULT); + + executeSingleTokenTest( + "the quick \"brown fox\"!~ jumped", + 2, + truth + ); + + truth = new SQPNotNearClause(3, 5, TYPE.QUOTE, + 3, 3); + executeSingleTokenTest( + "the quick \"brown fox\"!~3 jumped", + 2, + truth + ); + + truth = new SQPNotNearClause(3, 5, TYPE.QUOTE, + 3, 4); + executeSingleTokenTest( + "the quick \"brown fox\"!~3,4 jumped", + 2, + truth + ); + + truth = new SQPNotNearClause(3, 5, TYPE.BRACKET, + SQPNotNearClause.NOT_DEFAULT, + SQPNotNearClause.NOT_DEFAULT); + + executeSingleTokenTest( + "the quick [brown fox]!~ jumped", + 2, + truth + ); + + truth = new SQPNotNearClause(3, 5, TYPE.BRACKET, + 3, + 3); + executeSingleTokenTest( + "the quick [brown fox]!~3 jumped", + 2, + truth + ); + + truth = new SQPNotNearClause(3, 5, TYPE.BRACKET, + 3, + 4); + executeSingleTokenTest( + "the quick [brown fox]!~3,4 jumped", + 2, + truth + ); + } + + @Test + public void testUnescapes() throws ParseException{ + //lexer should unescape field names + //and boolean operators but nothing else + //the parser may need the escapes for determining type of multiterm + //and a few other 
things + + executeSingleTokenTest( + "the qu\\(ck", + 1, + new SQPTerm("qu\\(ck") + ); + + executeSingleTokenTest( + "the qu\\[ck", + 1, + new SQPTerm("qu\\[ck") + ); + + executeSingleTokenTest( + "the qu\\+ck", + 1, + new SQPTerm("qu\\+ck") + ); + executeSingleTokenTest( + "the qu\\-ck", + 1, + new SQPTerm("qu\\-ck") + ); + + executeSingleTokenTest( + "the qu\\\\ck", + 1, + new SQPTerm("qu\\\\ck") + ); + + executeSingleTokenTest( + "the qu\\ ck", + 1, + new SQPTerm("qu\\ ck") + ); + + executeSingleTokenTest( + "the field\\: quick", + 1, + new SQPTerm("field\\:") + ); + + executeSingleTokenTest( + "the quick \\AND nimble", + 2, + new SQPTerm("AND") + ); + + executeSingleTokenTest( + "the quick \\NOT nimble", + 2, + new SQPTerm("NOT") + ); + + executeSingleTokenTest( + "the quick \\OR nimble", + 2, + new SQPTerm("OR") + ); + + executeSingleTokenTest( + "the \\+ (quick -nimble)", + 1, + new SQPTerm("\\+") + ); + } + + + @Test + public void testBoolean() throws Exception{ + + executeSingleTokenTest( + "the quick AND nimble", + 2, + new SQPBooleanOpToken(SpanQueryParserBase.CONJ_AND) + ); + + executeSingleTokenTest( + "the quick NOT nimble", + 2, + new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT) + ); + + executeSingleTokenTest( + "the (quick NOT nimble) fox", + 3, + new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT) + ); + + + //not sure this is the right behavior + //lexer knows when it is in a near clause and doesn't parse + //boolean operators + executeSingleTokenTest( + "the [quick NOT nimble] fox", + 3, + new SQPTerm("NOT") + ); + + executeSingleTokenTest( + "the +quick +nimble", + 1, + new SQPBooleanOpToken(SpanQueryParserBase.MOD_REQ) + ); + + executeSingleTokenTest( + "the +quick -nimble", + 3, + new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT) + ); + + executeSingleTokenTest( + "the +(quick -nimble)", + 1, + new SQPBooleanOpToken(SpanQueryParserBase.MOD_REQ) + ); + + executeSingleTokenTest( + "the +(quick -nimble)", + 4, + new 
SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT) + ); + + } + + @Test + public void testRange() throws ParseException{ + executeSingleTokenTest( + "the [abc TO def] cat", + 1, + new SQPRangeTerm("abc", "def", true, true) + ); + + executeSingleTokenTest( + "the [quick brown ([abc TO def] fox)] cat", + 5, + new SQPRangeTerm("abc", "def", true, true) + ); + + SQPNearClause nearClause = new SQPNearClause(2, 5, + TYPE.BRACKET, false, + SQPNearClause.UNSPECIFIED_IN_ORDER, + SpanQueryParserBase.UNSPECIFIED_SLOP); + + + + executeSingleTokenTest( + "the [abc to def] cat", + 1, + nearClause + ); + + executeSingleTokenTest( + "the [abc \\TO def] cat", + 1, + nearClause + ); + + nearClause = new SQPNearClause(1, 4, + TYPE.BRACKET, false, + SQPNearClause.UNSPECIFIED_IN_ORDER, + SpanQueryParserBase.UNSPECIFIED_SLOP); + executeSingleTokenTest( + "[abc to def]", + 0, + nearClause + ); + + //not ranges + nearClause = new SQPNearClause(2, 5, + TYPE.BRACKET, true, + false, + 3); + + executeSingleTokenTest( + "the [abc to def]~3 cat", + 1, + nearClause + ); + + executeSingleTokenTest( + "the [abc TO def]~3 cat", + 1, + nearClause + ); + + SQPNotNearClause notNear = new SQPNotNearClause(2, + 5, TYPE.BRACKET, + 1, + 2); + + executeSingleTokenTest( + "the [abc TO def]!~1,2 cat", + 1, + notNear + ); + + + + //terms in range queries aren't checked for multiterm-hood + executeSingleTokenTest( + "the [abc~2 TO def] cat", + 1, + new SQPRangeTerm("abc~2", "def", true, true) + ); + + //terms in range queries aren't checked for multiterm-hood + executeSingleTokenTest( + "the [abc* TO *def] cat", + 1, + new SQPRangeTerm("abc*", "*def", true, true) + ); + + //\\TO is not unescaped currently + executeSingleTokenTest( + "the [abc \\TO def] cat", + 3, + new SQPTerm("\\TO") + ); + + + } + @Test + public void testBeyondBMP() throws Exception { + String bigChar = new String(new int[]{100000}, 0, 1); + String s = "ab"+bigChar+"cd"; + executeSingleTokenTest( + s, + 0, + new SQPTerm(s) + ); + + } + private 
void executeSingleTokenTest(String q, int targetOffset, SQPToken truth) + throws ParseException{ + List tokens = lexer.getTokens(q); + SQPToken target = tokens.get(targetOffset); + assertEquals(truth, target); + if (target instanceof SQPBoostableToken && truth instanceof SQPBoostableToken){ + assertEquals(((SQPBoostableToken)truth).getBoost(), + ((SQPBoostableToken)target).getBoost(), 0.00001f); + } + } + +} Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java (revision 0) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java (revision 0) @@ -0,0 +1,759 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.io.IOException; +import java.io.Reader; + +import static org.apache.lucene.util.automaton.BasicAutomata.makeString; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.spans.SpanOnlyParser; +import org.apache.lucene.queryparser.spans.AnalyzingQueryParserBase.NORM_MULTI_TERMS; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TotalHitCountCollector; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.Version; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.BasicOperations; +import 
org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class TestSpanOnlyQueryParser extends LuceneTestCase { + + private static IndexReader reader; + private static IndexSearcher searcher; + private static Directory directory; + private static Analyzer stopAnalyzer; + private static Analyzer noStopAnalyzer; + private static final String FIELD = "f1"; + private static final Version VERSION = Version.LUCENE_50; + + private static final CharacterRunAutomaton STOP_WORDS = new CharacterRunAutomaton( + BasicOperations.union(Arrays.asList(makeString("a"), makeString("an"), + makeString("and"), makeString("are"), makeString("as"), + makeString("at"), makeString("be"), makeString("but"), + makeString("by"), makeString("for"), makeString("if"), + makeString("in"), makeString("into"), makeString("is"), + makeString("it"), makeString("no"), makeString("not"), + makeString("of"), makeString("on"), makeString("or"), + makeString("such"), makeString("that"), makeString("the"), + makeString("their"), makeString("then"), makeString("there"), + makeString("these"), makeString("they"), makeString("this"), + makeString("to"), makeString("was"), makeString("will"), + makeString("with"), makeString("\u5927")))); + + @BeforeClass + public static void beforeClass() throws Exception { + + noStopAnalyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, + true); + TokenFilter filter = new MockStandardTokenizerFilter(tokenizer); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + stopAnalyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, + true); + TokenFilter filter = new 
MockStandardTokenizerFilter(tokenizer); + filter = new MockTokenFilter(filter, STOP_WORDS); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, stopAnalyzer) + .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000)) + .setMergePolicy(newLogMergePolicy())); + String[] docs = new String[] { + "the quick brown fox ", + "jumped over the lazy brown dog and the brown green cat", + "quick green fox", + "abcdefghijk", + "over green lazy", + // longish doc for recursion test + "eheu fugaces postume postume labuntur anni nec " + + "pietas moram rugis et instanti senectae " + + "adferet indomitaeque morti", + // non-whitespace language + "\u666E \u6797 \u65AF \u987F \u5927 \u5B66", + "reg/exp", + "/regex/", + "fuzzy~0.6", + "wil*card", + "wil?card", + "prefi*", + + }; + + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(newTextField(FIELD, docs[i], Field.Store.YES)); + writer.addDocument(doc); + } + reader = writer.getReader(); + searcher = new IndexSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + reader = null; + directory = null; + stopAnalyzer = null; + noStopAnalyzer = null; + } + + @Test + public void testBasic() throws Exception { + + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, stopAnalyzer); + + // test null and empty + boolean ex = false; + try{ + countSpansDocs(p, null, 0, 0); + + } catch (NullPointerException e){ + ex = true; + } + assertEquals(true, ex); + countSpansDocs(p, "", 0, 0); + + countSpansDocs(p, "brown", 3, 2); + + } + + @Test + public void testNear() throws Exception { + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + + boolean exc = false; + + try { + SpanQuery q = (SpanQuery)p.parse("\"brown \"dog\""); + } catch 
(ParseException e) { + exc = true; + } + assertEquals("unmatching \"", true, exc); + + exc = false; + try { + SpanQuery q = (SpanQuery)p.parse("[brown [dog]"); + } catch (ParseException e) { + exc = true; + } + assertEquals("unmatched [", true, exc); + + testOffsetForSingleSpanMatch(p, "\"brown dog\"", 1, 4, 6); + + countSpansDocs(p, "\"lazy dog\"", 0, 0); + + testOffsetForSingleSpanMatch(p, "\"lazy dog\"~2", 1, 3, 6); + + testOffsetForSingleSpanMatch(p, "\"lazy dog\"~>2", 1, 3, 6); + + testOffsetForSingleSpanMatch(p, "\"dog lazy\"~2", 1, 3, 6); + + countSpansDocs(p, "\"dog lazy\"~>2", 0, 0); + + testOffsetForSingleSpanMatch(p, "[\"lazy dog\"~>2 cat]~10", 1, 3, 11); + + testOffsetForSingleSpanMatch(p, "[\"lazy dog\"~>2 cat]~>10", 1, 3, 11); + + countSpansDocs(p, "[cat \"lazy dog\"~>2]~>10", 0, 0); + + // shows that "intervening" for multiple terms is additive + // 3 includes "over the" and "brown" + testOffsetForSingleSpanMatch(p, "[jumped lazy dog]~3", 1, 0, 6); + + // only two words separate each hit, but together, the intervening words > 2 + countSpansDocs(p, "[jumped lazy dog]~2", 0, 0); + + } + + @Test + public void testNotNear() throws Exception { + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + boolean exc = false; + try { + SpanQuery q = (SpanQuery)p.parse("\"brown dog car\"!~2,2"); + } catch (ParseException e) { + exc = true; + } + assertEquals("must have 2 components", true, exc); + + countSpansDocs(p, "\"brown dog\"!~2,2", 2, 2); + + testOffsetForSingleSpanMatch(p, "\"brown (green dog)\"!~1,1", 0, 2, 3); + + countSpansDocs(p, "\"brown (cat dog)\"!~1,1", 2, 2); + + countSpansDocs(p, "\"brown (quick lazy)\"!~0,4", 3, 2); + + countSpansDocs(p, "\"brown quick\"!~1,4", 2, 1); + + testOffsetForSingleSpanMatch(p, "\"brown (quick lazy)\"!~1,4", 1, 8, 9); + + // test empty + countSpansDocs(p, "\"z y\"!~0,4", 0, 0); + + testOffsetForSingleSpanMatch(p, "[[quick fox]~3 brown]!~1,1", 2, 0, 3); + + // traditional SpanNotQuery + 
testOffsetForSingleSpanMatch(p, "[[quick fox]~3 brown]!~", 2, 0, 3); + + } + + @Test + public void testWildcard() throws Exception { + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + boolean exc = false; + //default: don't allow leading wildcards + + try { + SpanQuery q = (SpanQuery)p.parse("*og"); + } catch (ParseException e) { + exc = true; + } + assertEquals("no leading wildcards \"", true, exc); + p.setAllowLeadingWildcard(true); + + // lowercasing as default + testOffsetForSingleSpanMatch(p, "*OG", 1, 5, 6); + + p.setNormMultiTerms(NORM_MULTI_TERMS.NONE); + + countSpansDocs(p, "*OG", 0, 0); + + testOffsetForSingleSpanMatch(p, "*og", 1, 5, 6); + testOffsetForSingleSpanMatch(p, "?og", 1, 5, 6); + + // brown dog and brown fox + countSpansDocs(p, "[brown ?o?]", 2, 2); + countSpansDocs(p, "[br* ?o?]", 2, 2); + } + + @Test + public void testPrefix() throws Exception { + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + + // lowercasing as default + countSpansDocs(p, "BR*", 3, 2); + + countSpansDocs(p, "br*", 3, 2); + + p.setNormMultiTerms(NORM_MULTI_TERMS.NONE); + countSpansDocs(p, "BR*", 0, 0); + + // not actually a prefix query + countSpansDocs(p, "br?", 0, 0); + + p.setAllowLeadingWildcard(true); + countSpansDocs(p, "*", 45, 13); + + } + + @Test + public void testRegex() throws Exception { + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + + + countSpansDocs(p, "/b[wor]+n/", 3, 2); + countSpansDocs(p, " /b[wor]+n/ ", 3, 2); + + testOffsetForSingleSpanMatch(p, " [/b[wor]+n/ fox]", 0, 2, 4); + + testOffsetForSingleSpanMatch(p, " [/b[wor]+n/fox]", 0, 2, 4); + + countSpansDocs(p, " [/b[wor]+n/ (fox dog)]", 2, 2); + + //default is to set to lowercase + countSpansDocs(p, "/B[wor]+n/", 3, 2); + + p.setNormMultiTerms(NORM_MULTI_TERMS.NONE); + countSpansDocs(p, "/B[wor]+n/", 0, 0); + + //test special regex escape + countSpansDocs(p, "/reg\\/exp/", 1, 1); + } + + @Test + public void testFuzzy() throws 
Exception { + //could use more testing of requested and fuzzyMinSim < 1.0f + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + + countSpansDocs(p, "bruun~", 3, 2); + countSpansDocs(p, "bruun~2", 3, 2); + + //default should reduce 3 to 2 and therefore not have any hits + countSpansDocs(p, "abcdefgh~3", 0, 0); + + p.setFuzzyMinSim(3.0f); + testOffsetForSingleSpanMatch(p, "abcdefgh~3", 3, 0, 1); + + // default lowercasing + testOffsetForSingleSpanMatch(p, "Abcdefgh~3", 3, 0, 1); + p.setNormMultiTerms(NORM_MULTI_TERMS.NONE); + countSpansDocs(p, "Abcdefgh~3", 0, 0); + + countSpansDocs(p, "brwon~1", 3, 2); + countSpansDocs(p, "brwon~>1", 0, 0); + + countSpansDocs(p, "crown~1,1", 0, 0); + countSpansDocs(p, "crown~2,1", 0, 0); + countSpansDocs(p, "crown~3,1", 0, 0); + countSpansDocs(p, "brwn~1,1", 3, 2); + + p.setFuzzyMinSim(0.6f); + countSpansDocs(p, "brwon~0.80", 3, 2); + + p.setFuzzyMinSim(0.85f); + countSpansDocs(p, "brwon~0.80", 0, 0); + + p.setFuzzyMinSim(0.80f); + + countSpansDocs(p, "brwon~2", 3, 2); + + p.setFuzzyMinSim(0.60f); + //this requires edit = 3 + testOffsetForSingleSpanMatch(p, "abcdefgh~0.60", 3, 0, 1); + + p.setFuzzyMinSim(0.65f); + //this requires edit = 3, 63% + countSpansDocs(p, "abcdefgh~0.60", 0, 0); + + //fuzzy val of 0 should yield straight SpanTermQuery + Query q = p.parse("brown~0.0"); + assertTrue("fuzzy val = 0.0", q instanceof SpanTermQuery); + q = p.parse("brown~0"); + assertTrue("fuzzy val = 0", q instanceof SpanTermQuery); + + } + + @Test + public void testStopWords() throws Exception { + // Stop word handling has some room for improvement with SpanQuery + // These tests codify the expectations (for regular behavior, + // parse exceptions and false hits) as of this writing. 
+ + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, stopAnalyzer); + + countSpansDocs(p, "the", 0, 0); + + // these are whittled down to just a query for brown + countSpansDocs(p, "[the brown]", 3, 2); + + countSpansDocs(p, "(the brown)", 3, 2); + + testException(p, "[brown the]!~5,5"); + + // this will not match because "the" is silently dropped from the query + countSpansDocs(p, "[over the lazy]", 0, 0); + + // this will get one right hit, but incorrectly match "over green lazy" + countSpansDocs(p, "[over the lazy]~1", 2, 2); + + // test throw exception + p.setThrowExceptionForEmptyTerm(true); + p.setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE); + + String[] stopExs = new String[]{ + "the", + "[the brown]", + "the brown", + "(the brown)", + "\"the brown\"", + "\"the\"", + "[the brown]!~2,2", + "[brown the]!~2,2", + "the*ter", + "the?ter" + }; + for (String ex : stopExs){ + testException(p, ex); + } + + // add tests for surprise phrasal with stopword!!! chinese + + SpanOnlyParser noStopsParser = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + noStopsParser.setAutoGeneratePhraseQueries(true); + // won't match because stop word was dropped in index + countSpansDocs(noStopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0); + // won't match for same reason + countSpansDocs(noStopsParser, "[\u666E\u6797\u65AF\u987F\u5927\u5B66]~2", + 0, 0); + + testOffsetForSingleSpanMatch(noStopsParser, + "[\u666E \u6797 \u65AF \u987F \u5B66]~2", 6, 0, 6); + + } + + + + @Test + public void testNonWhiteSpaceLanguage() throws Exception { + SpanOnlyParser noStopsParser = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + + testOffsetForSingleSpanMatch(noStopsParser, "\u666E", 6, 0, 1); + + countSpansDocs(noStopsParser, "\u666E\u6797", 2, 1); + + countSpansDocs(noStopsParser, "\u666E\u65AF", 2, 1); + + noStopsParser.setAutoGeneratePhraseQueries(true); + + testOffsetForSingleSpanMatch(noStopsParser, "\u666E\u6797", 6, 0, 2); + + // this would have a hit if autogenerate 
phrase queries = false + countSpansDocs(noStopsParser, "\u666E\u65AF", 0, 0); + + // treat as "or", this should have two spans + countSpansDocs(noStopsParser, "\u666E \u65AF", 2, 1); + + // stop word removed at indexing time and non existent here, + // this is treated as an exact phrase and should not match + countSpansDocs(noStopsParser, "\u666E\u6797\u65AF\u987F\u5B66", 0, 0); + + // this should be the same as above + countSpansDocs(noStopsParser, "[\u666E \u6797 \u65AF \u987F \u5B66]~0", 0, + 0); + + // look for the same phrase but allow for some slop; this should have one + // hit because this will skip the stop word + + testOffsetForSingleSpanMatch(noStopsParser, + "[\u666E \u6797 \u65AF \u987F \u5B66]~1", 6, 0, 6); + + // This tests the #specialHandlingForSpanNearWithOneComponent + // this is initially treated as [ [\u666E\u6797\u65AF\u5B66]~>0 ]~2 + // with the special treatment, this is rewritten as + // [\u666E \u6797 \u65AF \u5B66]~2 + testOffsetForSingleSpanMatch(noStopsParser, + "[\u666E\u6797\u65AF\u5B66]~2", 6, 0, 6); + + //If someone enters in a space delimited phrase within a phrase, + //treat it literally. There should be no matches. + countSpansDocs(noStopsParser, "[[lazy dog] ]~4", 0, 0); + + noStopsParser.setAutoGeneratePhraseQueries(false); + + // characters split into 2 tokens and treated as an "or" query + countSpansDocs(noStopsParser, "\u666E\u65AF", 2, 1); + + // TODO: Not sure i like how this behaves. 
+ // this is treated as [(\u666E \u6797 \u65AF \u987F \u5B66)]~1 + // which is then simplified to just: (\u666E \u6797 \u65AF \u987F \u5B66) + // Probably better to be treated as [\u666E \u6797 \u65AF \u987F \u5B66]~1 + + testOffsetForSingleSpanMatch(noStopsParser, + "[\u666E\u6797\u65AF\u987F\u5B66]~1", 6, 0, 6); + + SpanOnlyParser stopsParser = new SpanOnlyParser(VERSION, FIELD, stopAnalyzer); + stopsParser.setAutoGeneratePhraseQueries(true); + countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0); + + // now test for throwing of exception + stopsParser.setThrowExceptionForEmptyTerm(true); + boolean exc = false; + try { + countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0); + } catch (ParseException e) { + exc = true; + } + assertEquals(true, exc); + } + + @Test + public void testQuotedSingleTerm() throws Exception{ + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + + String[] quoteds = new String[]{ + "/regex/", + "fuzzy~0.6", + "wil*card", + "wil?card", + "prefi*"}; + + for (String q : quoteds){ + countSpansDocs(p, "\""+q+"\"", 1, 1); + } + } + + @Test + public void testRangeQueries() throws Exception { + //TODO: add tests, now fairly well covered by TestSPanQPBasedonQPTestBase + } + + + + @Test + public void testRecursion() throws Exception { + /* + * For easy reference of expected offsets + * + * 0: eheu 1: fugaces 2: postume 3: postume 4: labuntur 5: anni 6: nec 7: + * pietas 8: moram 9: rugis 10: et 11: instanti 12: senectae 13: adferet 14: + * indomitaeque 15: morti + */ + SpanOnlyParser p = new SpanOnlyParser(VERSION, FIELD, noStopAnalyzer); + + // String q = "[labunt* [pietas [rug?s senec*]!~2,0 ]~4 adferet]~5"; + // String q = "[pietas [rug?s senec*]!~2,0 ]~4"; + // countSpansDocs(p, q, 1, 1); + + // Span extents end at one more than the actual end, e.g.: + String q = "fugaces"; + testOffsetForSingleSpanMatch(p, q, 5, 1, 2); + + q = "morti"; + testOffsetForSingleSpanMatch(p, q, 5, 15, 
16); + + q = "[labunt* [pietas [rug?s senec*]~2 ]~4 adferet]~2"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // not near query for rugis senectae + q = "[labunt* [pietas [rug?s senec*]!~2 ]~4 adferet]~2"; + countSpansDocs(p, q, 0, 0); + + // not near query for rugis senectae, 0 before or 2 after + // Have to extend overall distance to 5 because hit for + // "rug?s senec*" matches only "rug?s" now + q = "[labunt* [pietas [rug?s senec*]!~2,0 ]~4 adferet]~5"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // not near query for rugis senectae, 0 before or 2 intervening + q = "[labunt* [pietas [rug?s senec*]!~0,2 ]~4 adferet]~5"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // not near query for rugis senectae, 0 before or 3 intervening + q = "[labunt* [pietas [rug?s senec*]!~0,3 ]~4 adferet]~2"; + countSpansDocs(p, q, 0, 0); + + // directionality specified + q = "[labunt* [pietas [rug?s senec*]~>2 ]~>4 adferet]~>2"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // no directionality, query order inverted + q = "[adferet [ [senec* rug?s ]~2 pietas ]~4 labunt*]~2"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // more than one word intervenes btwn rugis and senectae + q = "[labunt* [pietas [rug?s senec*]~1 ]~4 adferet]~2"; + countSpansDocs(p, q, 0, 0); + + // more than one word intervenes btwn labuntur and pietas + q = "[labunt* [pietas [rug?s senec*]~2 ]~4 adferet]~1"; + countSpansDocs(p, q, 0, 0); + } + + private void testException(SpanOnlyParser p, String q) throws Exception{ + boolean ex = false; + try{ + countSpansDocs(p, q, 3, 2); + } catch (ParseException e){ + ex = true; + } + assertTrue(q, ex); + + + } + private void countSpansDocs(SpanOnlyParser p, String s, int spanCount, + int docCount) throws Exception { + SpanQuery q = (SpanQuery)p.parse(s); + assertEquals("spanCount: " + s, spanCount, countSpans(q)); + assertEquals("docCount: " + s, docCount, countDocs(q)); + } + + private long countSpans(SpanQuery q) throws Exception { + 
List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + q = (SpanQuery) q.rewrite(ctx.reader()); + Spans spans = q.getSpans(ctx, null, new HashMap()); + + long i = 0; + while (spans.next()) { + i++; + } + return i; + } + + private long countDocs(SpanQuery q) throws Exception { + OpenBitSet docs = new OpenBitSet(); + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + IndexReaderContext parentCtx = reader.getContext(); + q = (SpanQuery) q.rewrite(ctx.reader()); + + Set qTerms = new HashSet(); + q.extractTerms(qTerms); + Map termContexts = new HashMap(); + + for (Term t : qTerms) { + TermContext c = TermContext.build(parentCtx, t); + termContexts.put(t, c); + } + + Spans spans = q.getSpans(ctx, null, termContexts); + + while (spans.next()) { + docs.set(spans.doc()); + } + long spanDocHits = docs.cardinality(); + // double check with a regular searcher + TotalHitCountCollector coll = new TotalHitCountCollector(); + searcher.search(q, coll); + assertEquals(coll.getTotalHits(), spanDocHits); + return spanDocHits; + + } + + private void testOffsetForSingleSpanMatch(SpanOnlyParser p, String s, + int trueDocID, int trueSpanStart, int trueSpanEnd) throws Exception { + SpanQuery q = (SpanQuery)p.parse(s); + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + q = (SpanQuery) q.rewrite(ctx.reader()); + Spans spans = q.getSpans(ctx, null, new HashMap()); + + int i = 0; + int spanStart = -1; + int spanEnd = -1; + int docID = -1; + while (spans.next()) { + spanStart = spans.start(); + spanEnd = spans.end(); + docID = spans.doc(); + i++; + } + assertEquals("should only be one matching span", 1, i); + assertEquals("doc id", trueDocID, docID); + assertEquals("span start", trueSpanStart, spanStart); + assertEquals("span end", trueSpanEnd, spanEnd); + } + + + /** + * Mocks StandardAnalyzer for tokenizing Chinese characters (at least for + * 
these test cases into individual tokens). + * + */ + private final static class MockStandardTokenizerFilter extends TokenFilter { + // Only designed to handle test cases. You may need to modify this + // if adding new test cases. Note that position increment is hardcoded to be + // 1!!! + private final Pattern hackCJKPattern = Pattern + .compile("([\u5900-\u9899])|([\\p{InBasic_Latin}]+)"); + private List buffer = new LinkedList(); + + private final CharTermAttribute termAtt; + private final PositionIncrementAttribute posIncrAtt; + + public MockStandardTokenizerFilter(TokenStream in) { + super(in); + termAtt = addAttribute(CharTermAttribute.class); + posIncrAtt = addAttribute(PositionIncrementAttribute.class); + } + + @Override + public final boolean incrementToken() throws java.io.IOException { + if (buffer.size() > 0) { + termAtt.setEmpty().append(buffer.remove(0)); + posIncrAtt.setPositionIncrement(1); + return true; + } else { + boolean next = input.incrementToken(); + if (!next) { + return false; + } + // posIncrAtt.setPositionIncrement(1); + String text = termAtt.toString(); + Matcher m = hackCJKPattern.matcher(text); + boolean hasCJK = false; + while (m.find()) { + if (m.group(1) != null) { + hasCJK = true; + buffer.add(m.group(1)); + } else if (m.group(2) != null) { + buffer.add(m.group(2)); + } + } + if (hasCJK == false) { + // don't change the position increment, the super class will handle + // stop words properly + buffer.clear(); + return true; + } + if (buffer.size() > 0) { + termAtt.setEmpty().append(buffer.remove(0)); + posIncrAtt.setPositionIncrement(1); + } + return true; + } + } + + @Override + public void reset() throws IOException { + super.reset(); + } + } +} Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java (revision 0) +++ 
lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java (revision 0) @@ -0,0 +1,304 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.spans.SpanQueryParser; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import 
org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; +import org.apache.lucene.util._TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class TestOverallSpanQueryParser extends LuceneTestCase{ + private final static String FIELD1 = "f1"; + private final static String FIELD2 = "f2"; + private static Analyzer analyzer = null; + private static Directory directory = null; + private static IndexReader reader = null; + private static IndexSearcher searcher = null; + private static SpanQueryParser parser; + private final static Version VERSION = Version.LUCENE_50; + + @BeforeClass + public static void beforeClass() throws Exception { + analyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, + false); + return new TokenStreamComponents(tokenizer, tokenizer); + } + }; + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer) + .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000)) + .setMergePolicy(newLogMergePolicy())); + String[] f1Docs = new String[] { + "quick brown AND fox",//0 + "quick brown AND dog", //1 + "quick brown dog", //2 + "whan that aprile with its shoures perced", //3 + "its shoures pierced", //4 + "its shoures perced", //5 + "#####", //before asterisk //6 + "&&&&&", //after asterisk for range query //7 + "ab*de", //8 + "abcde" //9 + + }; + String [] f2Docs = new String[] { + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine" + }; + for (int i = 0; i < f1Docs.length; i++) { + Document doc = new Document(); + doc.add(newTextField(FIELD1, f1Docs[i], Field.Store.YES)); + 
doc.add(newTextField(FIELD2, f2Docs[i], Field.Store.YES)); + writer.addDocument(doc); + } + reader = writer.getReader(); + searcher = new IndexSearcher(reader); + writer.close(); + + parser = new SpanQueryParser(VERSION, FIELD1, analyzer); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + reader = null; + searcher = null; + directory = null; + analyzer = null; + } + + + @Test + public void testBooleanQueryConstruction() throws Exception { + String s = "cat dog AND elephant aardvark"; + Query q = parser.parse(s); + assertTrue(q instanceof BooleanQuery); + BooleanQuery bq = (BooleanQuery)q; + List clauses = bq.clauses(); + assertEquals(4, clauses.size()); + testForClause(clauses, "cat", Occur.SHOULD); + testForClause(clauses, "dog", Occur.MUST); + testForClause(clauses, "elephant", Occur.MUST); + testForClause(clauses, "aardvark", Occur.SHOULD); + + s = "cat dog NOT elephant aardvark"; + q = parser.parse(s); + assertTrue(q instanceof BooleanQuery); + bq = (BooleanQuery)q; + clauses = bq.clauses(); + assertEquals(4, clauses.size()); + testForClause(clauses, "cat", Occur.SHOULD); + testForClause(clauses, "dog", Occur.SHOULD); + testForClause(clauses, "elephant", Occur.MUST_NOT); + testForClause(clauses, "aardvark", Occur.SHOULD); + + s = "cat +dog -elephant +aardvark"; + q = parser.parse(s); + assertTrue(q instanceof BooleanQuery); + bq = (BooleanQuery)q; + clauses = bq.clauses(); + assertEquals(4, clauses.size()); + testForClause(clauses, "cat", Occur.SHOULD); + testForClause(clauses, "dog", Occur.MUST); + testForClause(clauses, "elephant", Occur.MUST_NOT); + testForClause(clauses, "aardvark", Occur.MUST); + + } + + @Test + public void testFields() throws Exception { + compareHits("f1:brown f2:three", 0, 1, 2, 3); + + //four should go back to f1 + compareHits("f1:brown f2:three four", 0, 1, 2, 3); + compareHits("f1:brown f2:(three four)", 0, 1, 2, 3, 4); + compareHits("f1:brown f2:(three four) five", 0, 
1, 2, 3, 4); + compareHits("f1:brown f2:(three four) f2:five", 0, 1, 2, 3, 4, 5); + compareHits("f1:brown f2:(f1:three four) f2:five", 0, 1, 2, 4, 5); + + SpanQueryParser p = new SpanQueryParser(VERSION, FIELD2, analyzer); + compareHits(p, "f1:brown three four", 0, 1, 2, 3, 4); + compareHits(p, "f1:brown (three four)", 0, 1, 2, 3, 4); + compareHits(p, "f1:brown (three four) five", 0, 1, 2, 3, 4, 5); + compareHits(p, "f1:brown (three four) five", 0, 1, 2, 3, 4, 5); + compareHits(p, "f1:brown (f1:three four) five", 0, 1, 2, 4, 5); + + } + @Test + public void testBooleanOrHits() throws Exception { + compareHits("f2:three (brown dog)", 0, 1, 2, 3); + compareHits("f2:three (brown dog)~2", 1, 2, 3); + } + + @Test + public void testBooleanHits() throws Exception { + //test treatment of AND within phrase + compareHits("quick NOT [brown AND (fox dog)]", 2); + compareHits("quick AND [bruwn~1 AND (f?x do?)]", 0, 1); + compareHits("(whan AND aprile) (shoures NOT perced)", 3, 4); + //test escaping of AND + compareHits("zoo \\AND elephant", 0, 1); + } + + + + private void testForClause(List clauses, String term, Occur occur){ + assertTrue(clauses.contains( + new BooleanClause( + new SpanTermQuery( + new Term(FIELD1, term)), + occur)) + ); + + } + private void compareHits(String s, int ... docids ) throws Exception{ + compareHits(new SpanQueryParser(VERSION, FIELD1, analyzer), s, docids); + } + + private void compareHits(SpanQueryParser p, String s, int ... 
docids ) throws Exception{ + Query q = p.parse(s); + TopScoreDocCollector results = TopScoreDocCollector.create(1000, true); + searcher.search(q, results); + ScoreDoc[] scoreDocs = results.topDocs().scoreDocs; + Set hits = new HashSet(); + + for (int i = 0; i < scoreDocs.length; i++){ + hits.add(scoreDocs[i].doc); + } + assertEquals(docids.length, hits.size()); + + for (int i = 0; i < docids.length; i++){ + assertTrue(hits.contains(docids[i])); + } + + } + + @Test + public void testExceptions(){ + String[] strings = new String[]{ + "cat OR OR dog", + "cat OR AND dog", + "cat AND AND dog", + "cat NOT NOT dog", + "cat NOT AND dog", + "cat NOT OR dog", + "cat NOT -dog", + "cat NOT +dog", + "OR", + "+", + "AND dog", + "OR dog", + "dog AND", + "dog OR", + "dog NOT", + "dog -", + "dog +"}; + + for (String s : strings){ + testException(s, parser); + } + } + + private void testException(String s, SpanQueryParser p){ + boolean ex = false; + try{ + Query query = p.parse(s); + } catch (ParseException e){ + ex = true; + } catch (Exception e){ + + } + assertTrue(s, ex); + } + + @Test + public void testIsEscaped() throws Exception{ + + String[] notEscaped = new String[]{ + "abcd", + "a\\\\d", + }; + for (String s : notEscaped){ + assertFalse(s, SpanQueryParserBase.isCharEscaped(s, 3)); + } + String[] escaped = new String[]{ + "ab\\d", + "\\\\\\d", + }; + for (String s : escaped){ + assertTrue(s, SpanQueryParserBase.isCharEscaped(s, 3)); + } + + Query q = parser.parse("abc\\~2.0"); + assertTrue(q.toString(), q instanceof SpanTermQuery); + q = parser.parse("abc\\\\\\~2.0"); + assertTrue(q.toString(), q instanceof SpanTermQuery); + q = parser.parse("abc\\\\~2.0"); + assertTrue(q.toString(), q instanceof SpanMultiTermQueryWrapper); + + q = parser.parse("abc\\*d"); + assertTrue(q.toString(), q instanceof SpanTermQuery); + + q = parser.parse("abc\\\\\\*d"); + assertTrue(q.toString(), q instanceof SpanTermQuery); + + q = parser.parse("abc\\\\*d"); + assertTrue(q.toString(), q 
instanceof SpanMultiTermQueryWrapper); + + } +} Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQPBasedOnQPTestBase.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQPBasedOnQPTestBase.java (revision 0) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanQPBasedOnQPTestBase.java (revision 0) @@ -0,0 +1,1223 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.text.DateFormat; +import java.util.Calendar; +import java.util.Date; +import java.util.GregorianCalendar; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; +import java.util.TimeZone; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.document.DateTools; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParserBase; +import org.apache.lucene.queryparser.classic.QueryParser.Operator; +import org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration; +import org.apache.lucene.queryparser.spans.SpanQueryParser; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import 
org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.RegExp; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + + +public class TestSpanQPBasedOnQPTestBase extends LuceneTestCase { + + public static Analyzer qpAnalyzer; + public static String FIELD = "f1"; + + @BeforeClass + public static void beforeClass() { + qpAnalyzer = new QPTestAnalyzer(); + } + + @AfterClass + public static void afterClass() { + qpAnalyzer = null; + } + + public static final class QPTestFilter extends TokenFilter { + CharTermAttribute termAtt; + OffsetAttribute offsetAtt; + + /** + * Filter which discards the token 'stop' and which expands the + * token 'phrase' into 'phrase1 phrase2' + */ + public QPTestFilter(TokenStream in) { + super(in); + termAtt = addAttribute(CharTermAttribute.class); + offsetAtt = addAttribute(OffsetAttribute.class); + } + + boolean inPhrase = false; + int savedStart = 0, savedEnd = 0; + + @Override + public boolean incrementToken() throws IOException { + if (inPhrase) { + inPhrase = false; + clearAttributes(); + termAtt.append("phrase2"); + offsetAtt.setOffset(savedStart, savedEnd); + return true; + } else + while (input.incrementToken()) { + if (termAtt.toString().equals("phrase")) { + inPhrase = true; + savedStart = offsetAtt.startOffset(); + savedEnd = offsetAtt.endOffset(); + termAtt.setEmpty().append("phrase1"); + offsetAtt.setOffset(savedStart, savedEnd); + return true; + } else if (!termAtt.toString().equals("stop")) + return true; + } + return false; + } + } + + public static final class QPTestAnalyzer extends Analyzer { + + /** Filters MockTokenizer with StopFilter. 
*/ + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + return new TokenStreamComponents(tokenizer, new QPTestFilter(tokenizer)); + } + } + + + private int originalMaxClauses; + + + @Override + public void setUp() throws Exception { + super.setUp(); + originalMaxClauses = BooleanQuery.getMaxClauseCount(); + } + + public CommonQueryParserConfiguration getParserConfig(Analyzer a) throws Exception{ + CommonQueryParserConfiguration cqpc = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a); + return cqpc; + } + public Query getQuery(String query) throws Exception { + return getQuery(query, (Analyzer)null); + } + + private Query getQuery(String query, Analyzer analyzer) throws Exception { + Analyzer a = (analyzer == null) ? qpAnalyzer : analyzer; + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a); + return p.parse(query); + } + + public Query getQuery(String query, CommonQueryParserConfiguration cqpC) throws Exception{ + + SpanQueryParser p = (SpanQueryParser)cqpC; + + return p.parse(query); + } + public void setDateResolution(CommonQueryParserConfiguration cqpC, CharSequence field, DateTools.Resolution value){ + assert (cqpC instanceof SpanQueryParser); + ((SpanQueryParser)cqpC).setDateResolution(field.toString(), value); + } + + private void setAutoGeneratePhraseQueries(CommonQueryParserConfiguration qp, + boolean b) { + assert (qp instanceof SpanQueryParser); + ((SpanQueryParser)qp).setAutoGeneratePhraseQueries(b); + + } + + public void assertQueryEquals(String query, Analyzer a, String result) + throws Exception { + Query q = getQuery(query, a); + String s = q.toString(FIELD); + if (!s.equals(result)) { + fail("Query /" + query + "/ yielded /" + s + + "/, expecting /" + result + "/"); + } + } + + public void assertQueryEquals(CommonQueryParserConfiguration cqpC, String field, String query, String result) + throws Exception 
{ + Query q = getQuery(query, cqpC); + String s = q.toString(field); + if (!s.equals(result)) { + fail("Query /" + query + "/ yielded /" + s + + "/, expecting /" + result + "/"); + } + } + public void assertBoostEquals(String query, float b) + throws Exception { + double precision = 0.00001; + Query q = getQuery(query); + if (Math.abs(q.getBoost() - b) > precision){ + fail("Query /" + query + "/ yielded boost:" + q.getBoost() + + "/, expecting /" + b + "/"); + } + } + + private void assertEqualsWrappedRegexp(RegexpQuery q, Query query) { + assertTrue(query instanceof SpanMultiTermQueryWrapper); + + SpanMultiTermQueryWrapper wrapped = new SpanMultiTermQueryWrapper(q); + + assertEquals(wrapped, query); + } + + public void assertEscapedQueryEquals(String query, Analyzer a, String result) + throws Exception { + String escapedQuery = QueryParserBase.escape(query); + if (!escapedQuery.equals(result)) { + fail("Query /" + query + "/ yielded /" + escapedQuery + + "/, expecting /" + result + "/"); + } + } + + + private void assertMultitermEquals(Query query, + String expected) throws Exception { + assertMultitermEquals(FIELD, query, expected); + } + private void assertMultitermEquals(String field, Query query, + String expected) throws Exception { + expected = "SpanMultiTermQueryWrapper("+field+":"+ expected+")"; + + //need to trim final .0 for fuzzy queries because + //sometimes they appear in the string and sometimes they don't + expected = expected.replace(".0)", ")"); + String qString = query.toString().replace(".0)", ")"); + assertEquals(expected, qString); + + } + + private void assertMultitermEquals(String s, + String expected) throws Exception { + assertMultitermEquals(s, qpAnalyzer, expected); + } + + private void assertMultitermEquals(String s, + String expected, float boost) throws Exception { + Analyzer a = qpAnalyzer; + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a); + Query q = p.parse(s); + assertMultitermEquals(q, expected); + 
assertEquals(q.getBoost(), boost, 0.000001f); + } + + private void assertMultitermEquals(String query, boolean b, + String expected) throws Exception { + Analyzer a = qpAnalyzer; + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a); + p.setLowercaseExpandedTerms(b); + Query q = p.parse(query); + assertMultitermEquals(q, expected); + } + + private void assertMultitermEquals(String field, + String query, Analyzer a, String expected) throws Exception{ + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a); + Query q = p.parse(query); + assertMultitermEquals(field, q, expected); + } + + private void assertMultitermEquals(String query, Analyzer a, String expected) throws Exception{ + assertMultitermEquals(FIELD, query, a, expected); + } + + private void assertMultitermEquals(String query, boolean lowercase, + String expected, boolean allowLeadingWildcard) throws Exception { + Analyzer a = qpAnalyzer; + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD, a); + p.setLowercaseExpandedTerms(lowercase); + p.setAllowLeadingWildcard(allowLeadingWildcard); + Query q = p.parse(query); + assertMultitermEquals(q, expected); + } + + private boolean isQueryParserException(Exception pe) { + if (pe instanceof ParseException){ + return true; + } + return false; + } + public void testCJK() throws Exception { + // Test Ideographic Space - As wide as a CJK character cell (fullwidth) + // used google to translate the word "term" to japanese -> 用語 + assertQueryEquals("term\u3000term\u3000term", null, "term\u0020term\u0020term"); + assertQueryEquals("用語\u3000用語\u3000用語", null, "用語\u0020用語\u0020用語"); + } + + //individual CJK chars as terms, like StandardAnalyzer + protected static class SimpleCJKTokenizer extends Tokenizer { + private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + public SimpleCJKTokenizer(Reader input) { + super(input); + } + + @Override + public final boolean incrementToken() throws IOException 
{ + int ch = input.read(); + if (ch < 0) + return false; + clearAttributes(); + termAtt.setEmpty().append((char) ch); + return true; + } + } + + private class SimpleCJKAnalyzer extends Analyzer { + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + return new TokenStreamComponents(new SimpleCJKTokenizer(reader)); + } + } + + public void testCJKTerm() throws Exception { + // individual CJK chars as terms + SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer(); + + SpanOrQuery expected = new SpanOrQuery( + new SpanQuery[]{ + new SpanTermQuery(new Term(FIELD, "中")), + new SpanTermQuery(new Term(FIELD, "国")) + }); + + assertEquals(expected, getQuery("中国", analyzer)); + } + + public void testCJKBoostedTerm() throws Exception { + // individual CJK chars as terms + SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer(); + + SpanOrQuery expected = new SpanOrQuery( + new SpanQuery[]{ + new SpanTermQuery(new Term(FIELD, "中")), + new SpanTermQuery(new Term(FIELD, "国")) + }); + expected.setBoost(0.5f); + + assertEquals(expected, getQuery("中国^0.5", analyzer)); + } + + public void testCJKPhrase() throws Exception { + // individual CJK chars as terms + SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer(); + + SpanNearQuery expected = new SpanNearQuery( + new SpanQuery[]{ + new SpanTermQuery(new Term(FIELD, "中")), + new SpanTermQuery(new Term(FIELD, "国")) + }, 0, true); + + assertEquals(expected, getQuery("\"中国\"", analyzer)); + } + + public void testCJKBoostedPhrase() throws Exception { + // individual CJK chars as terms + SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer(); + + SpanNearQuery expected = new SpanNearQuery( + new SpanQuery[]{ + new SpanTermQuery(new Term(FIELD, "中")), + new SpanTermQuery(new Term(FIELD, "国")) + }, 0, true); + expected.setBoost(0.5f); + assertEquals(expected, getQuery("\"中国\"^0.5", analyzer)); + } + + public void testCJKSloppyPhrase() throws Exception { + // individual CJK chars as terms + SimpleCJKAnalyzer 
analyzer = new SimpleCJKAnalyzer(); + + SpanNearQuery expected = new SpanNearQuery( + new SpanQuery[]{ + new SpanTermQuery(new Term(FIELD, "中")), + new SpanTermQuery(new Term(FIELD, "国")) + }, 3, false); + + assertEquals(expected, getQuery("\"中国\"~3", analyzer)); + } + + + public void testAutoGeneratePhraseQueriesOn() throws Exception { + // individual CJK chars as terms + SimpleCJKAnalyzer analyzer = new SimpleCJKAnalyzer(); + + SpanNearQuery expected = new SpanNearQuery( + new SpanTermQuery[]{ + new SpanTermQuery(new Term(FIELD, "中")), + new SpanTermQuery(new Term(FIELD, "国")) + }, 0, true); + CommonQueryParserConfiguration qp = getParserConfig(analyzer); + setAutoGeneratePhraseQueries(qp, true); + assertEquals(expected, getQuery("中国",qp)); + } + + + + public void testSimple() throws Exception { + assertQueryEquals("term term term", null, "term term term"); + assertQueryEquals("türm term term", new MockAnalyzer(random()), "türm term term"); + assertQueryEquals("ümlaut", new MockAnalyzer(random()), "ümlaut"); + + // FIXME: enhance MockAnalyzer to be able to support this + // it must no longer extend CharTokenizer + //assertQueryEquals("\"\"", new KeywordAnalyzer(), ""); + //assertQueryEquals("foo:\"\"", new KeywordAnalyzer(), "foo:"); + + assertQueryEquals("a AND b", null, "+a +b"); + assertQueryEquals("(a AND b)", null, "+a +b"); + assertQueryEquals("c (a AND b)", null, "c (+a +b)"); + assertQueryEquals("a AND NOT b", null, "+a -b"); + assertQueryEquals("a AND -b", null, "+a -b"); + + assertQueryEquals("a b", null, "a b"); + assertQueryEquals("a -b", null, "a -b"); + + assertQueryEquals("+term -term term", null, "+term -term term"); + assertQueryEquals("foo:term AND "+FIELD+":anotherTerm", null, + "+foo:term +anotherterm"); + assertQueryEquals("term AND \"phrase phrase\"", null, + "+term +spanNear([spanOr([phrase1, phrase2]), "+ + "spanOr([phrase1, phrase2])], 0, true)"); + assertQueryEquals("\"hello there\"", null, "spanNear([hello, there], 0, true)"); + 
assertTrue(getQuery("a AND b") instanceof BooleanQuery); + assertTrue(getQuery("hello") instanceof SpanTermQuery); + assertTrue(getQuery("\"hello there\"") instanceof SpanNearQuery); + + assertQueryEquals("germ term^2.0", null, "germ term^2.0"); + assertQueryEquals("(term)^2.0", null, "term^2.0"); + assertQueryEquals("(germ term)^2.0", null, "(germ term)^2.0"); + assertQueryEquals("term^2.0", null, "term^2.0"); + assertQueryEquals("term^2", null, "term^2.0"); + assertQueryEquals("\"germ term\"^2.0", null, "spanNear([germ, term], 0, true)^2.0"); + assertQueryEquals("\"term germ\"^2", null, "spanNear([term, germ], 0, true)^2.0"); + + assertQueryEquals("(foo bar) AND (baz boo)", null, + "+(foo bar) +(baz boo)"); + assertQueryEquals("((a b) AND NOT c) d", null, + "(+(a b) -c) d"); + assertQueryEquals("+(apple \"steve jobs\") -(foo bar baz)", null, + "+(apple spanNear([steve, jobs], 0, true)) -(foo bar baz)"); + assertQueryEquals("+title:(dog cat) -author:\"bob dole\"", null, + "+(title:dog title:cat) -spanNear([author:bob, author:dole], 0, true)"); + + } + + + public void testOperatorVsWhitespace() throws Exception { //LUCENE-2566 + // +,-,! should be directly adjacent to operand (i.e. not separated by whitespace) to be treated as an operator + Analyzer a = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + } + }; + assertQueryEquals("a - b", a, "a - b"); + assertQueryEquals("a + b", a, "a + b"); + assertQueryEquals("a ! b", a, "a ! 
b");
+  }
+
+  public void testPunct() throws Exception {
+    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    assertQueryEquals("a&b", a, "a&b");
+    assertQueryEquals("a&&b", a, "a&&b");
+    assertQueryEquals(".NET", a, ".NET");
+  }
+
+  public void testSlop() throws Exception {
+    assertQueryEquals("\"term germ\"~2", null, "spanNear([term, germ], 2, false)");
+    assertQueryEquals("\"term germ\"~2 flork", null, "spanNear([term, germ], 2, false) flork");
+    assertQueryEquals("\"term\"~2", null, "term");
+    assertQueryEquals("\" \"~2 germ", null, "germ");
+    assertQueryEquals("\"term germ\"~2^2", null, "spanNear([term, germ], 2, false)^2.0");
+  }
+
+  public void testNumber() throws Exception {
+    // The numbers go away because SimpleAnalyzer ignores them
+    assertQueryEquals("3", null, "spanOr([])");
+    assertQueryEquals("term 1.0 1 2", null, "term");
+    assertQueryEquals("term term1 term2", null, "term term term");
+
+    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
+    assertQueryEquals("3", a, "3");
+    assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
+    assertQueryEquals("term term1 term2", a, "term term1 term2");
+  }
+
+
+  public void testWildcard() throws Exception {
+    assertMultitermEquals("term*", "term*");
+
+    assertMultitermEquals("term*^2","term*", 2.0f);
+    assertMultitermEquals("term~", "term~2.0");
+    assertMultitermEquals("term~1", "term~1.0");
+    assertMultitermEquals("term~0.7","term~1.0");
+    assertMultitermEquals("term~^3", "term~2.0", 3.0f);
+    //not currently supported in SpanQueryParser
+    // assertWildcardQueryEquals("term^3~", "term~2.0", 3.0f);
+    assertMultitermEquals("term*germ", "term*germ");
+    assertMultitermEquals("term*germ^3", "term*germ", 3.0f);
+
+
+    PrefixQuery p = new PrefixQuery(new Term(FIELD, "term"));
+    SpanQuery wrapped = new SpanMultiTermQueryWrapper(p);
+    assertEquals(getQuery("term*"), wrapped);
+
+    p = new PrefixQuery(new Term(FIELD, "term"));
+    wrapped = new SpanMultiTermQueryWrapper(p);
+    
Query parsed = getQuery("term*^2");
+    assertEquals(parsed, wrapped);
+    assertEquals(2.0f, parsed.getBoost(), 0.00001f);
+
+    FuzzyQuery f = new FuzzyQuery(new Term(FIELD, "term"), (int)2.0f);
+    wrapped = new SpanMultiTermQueryWrapper(f);
+
+    //not great test; better if we could retrieve wrapped query for testing.
+    //don't want to move these tests to SMTQW package.
+    assertTrue(getQuery("term~") instanceof SpanMultiTermQueryWrapper);
+    assertTrue(getQuery("term~0.7") instanceof SpanMultiTermQueryWrapper);
+    /*can't easily test this;
+    //FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7");
+    //assertEquals(1, fq.getMaxEdits());
+
+
+    assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
+    fq = (FuzzyQuery)getQuery("term~");
+    assertEquals(2, fq.getMaxEdits());
+    assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
+    */
+    //not true of SpanQueryParser...rounds value > 1
+    //assertParseException("term~1.1"); // value > 1, throws exception
+
+    assertTrue(getQuery("term*germ") instanceof SpanMultiTermQueryWrapper);
+
+    /* Tests to see that wild card terms are (or are not) properly
+     * lower-cased with proper parser configuration
+     */
+    // First prefix queries:
+    // by default, convert to lowercase:
+
+    assertMultitermEquals("Term*", true, "term*");
+    // explicitly set lowercase:
+    assertMultitermEquals("term*", true, "term*");
+    assertMultitermEquals("Term*", true, "term*");
+    assertMultitermEquals("TERM*", true, "term*");
+    // explicitly disable lowercase conversion:
+    assertMultitermEquals("term*", false, "term*");
+    assertMultitermEquals("Term*", false, "Term*");
+    assertMultitermEquals("TERM*", false, "TERM*");
+    // Then 'full' wildcard queries:
+    // by default, convert to lowercase:
+    assertMultitermEquals("Te?m", "te?m");
+    // explicitly set lowercase:
+    assertMultitermEquals("te?m", true, "te?m");
+    assertMultitermEquals("Te?m", true, "te?m");
+    assertMultitermEquals("TE?M", true, "te?m");
+    assertMultitermEquals("Te?m*gerM", true, 
"te?m*germ"); + // explicitly disable lowercase conversion: + assertMultitermEquals("te?m", false, "te?m"); + assertMultitermEquals("Te?m", false, "Te?m"); + assertMultitermEquals("TE?M", false, "TE?M"); + assertMultitermEquals("Te?m*gerM", false, "Te?m*gerM"); + // Fuzzy queries: + assertMultitermEquals("Term~", "term~2.0"); + assertMultitermEquals("Term~", true, "term~2.0"); + assertMultitermEquals("Term~", false, "Term~2.0"); + // Range queries: + assertMultitermEquals("[A TO C]", "[a TO c]"); + assertMultitermEquals("[A TO C]", true, "[a TO c]"); + assertMultitermEquals("[A TO C]", false, "[A TO C]"); + + + // Test suffix queries: first disallow + try { + assertMultitermEquals("*Term", true, "*term"); + } catch(Exception pe) { + // expected exception + if(!isQueryParserException(pe)){ + fail(); + } + } + try { + assertMultitermEquals("?Term", true, "?term"); + fail(); + } catch(Exception pe) { + // expected exception + if(!isQueryParserException(pe)){ + fail(); + } + } + // Test suffix queries: then allow + assertMultitermEquals("*Term", true, "*term", true); + assertMultitermEquals("?Term", true, "?term", true); + } + + + + + + + + public void testLeadingWildcardType() throws Exception { + CommonQueryParserConfiguration cqpC = getParserConfig(null); + cqpC.setAllowLeadingWildcard(true); + assertEquals(SpanMultiTermQueryWrapper.class, getQuery("t*erm*",cqpC).getClass()); + assertEquals(SpanMultiTermQueryWrapper.class, getQuery("?term*",cqpC).getClass()); + assertEquals(SpanMultiTermQueryWrapper.class, getQuery("*term*",cqpC).getClass()); + } + + public void testQPA() throws Exception { + assertQueryEquals("term term^3.0 term", qpAnalyzer, "term term^3.0 term"); + assertQueryEquals("term stop^3.0 term", qpAnalyzer, "term term"); + + assertQueryEquals("term term term", qpAnalyzer, "term term term"); + assertQueryEquals("term +stop term", qpAnalyzer, "term term"); + assertQueryEquals("term -stop term", qpAnalyzer, "term term"); + + assertQueryEquals("drop AND 
(stop) AND roll", qpAnalyzer, "+drop +roll"); + assertQueryEquals("term +(stop) term", qpAnalyzer, "term term"); + assertQueryEquals("term -(stop) term", qpAnalyzer, "term term"); + + assertQueryEquals("drop AND stop AND roll", qpAnalyzer, "+drop +roll"); + assertQueryEquals("term phrase term", qpAnalyzer, + "term spanOr([phrase1, phrase2]) term"); + assertQueryEquals("term AND NOT phrase term", qpAnalyzer, + "+term -spanOr([phrase1, phrase2]) term"); + assertQueryEquals("stop^3", qpAnalyzer, "spanOr([])"); + assertQueryEquals("stop", qpAnalyzer, "spanOr([])"); + assertQueryEquals("(stop)^3", qpAnalyzer, "spanOr([])"); + assertQueryEquals("((stop))^3", qpAnalyzer, "spanOr([])"); + assertQueryEquals("(stop^3)", qpAnalyzer, "spanOr([])"); + assertQueryEquals("((stop)^3)", qpAnalyzer, "spanOr([])"); + assertQueryEquals("(stop)", qpAnalyzer, "spanOr([])"); + assertQueryEquals("((stop))", qpAnalyzer, "spanOr([])"); + assertTrue(getQuery("term term term", qpAnalyzer) instanceof BooleanQuery); + assertTrue(getQuery("term +stop", qpAnalyzer) instanceof SpanTermQuery); + } + + public void testRange() throws Exception { + assertQueryEquals("[ a TO z]", null, "SpanMultiTermQueryWrapper([a TO z])"); + assertQueryEquals("[ a TO z}", null, "SpanMultiTermQueryWrapper([a TO z})"); + assertQueryEquals("{ a TO z]", null, "SpanMultiTermQueryWrapper({a TO z])"); + assertQueryEquals("{ a TO z}", null, "SpanMultiTermQueryWrapper({a TO z})"); + + //SQP:not sure what this should be + // assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, + // ((SpanMultiTermQueryWrapper)getQuery("[ a TO z]")).getRewriteMethod()); + //TODO: turn back on + /* + CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); + + qp.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE,((TermRangeQuery)getQuery("[ a TO z]", qp)).getRewriteMethod()); + + // test open ranges 
+ assertQueryEquals("[ a TO * ]", null, "[a TO *]"); + assertQueryEquals("[ * TO z ]", null, "[* TO z]"); + assertQueryEquals("[ * TO * ]", null, "[* TO *]"); + */ + // mixing exclude and include bounds + assertQueryEquals("{ a TO z ]", null, "SpanMultiTermQueryWrapper({a TO z])"); + assertQueryEquals("[ a TO z }", null, "SpanMultiTermQueryWrapper([a TO z})"); + assertQueryEquals("{ a TO * ]", null, "SpanMultiTermQueryWrapper({a TO \\*])"); + assertQueryEquals("[ * TO z }", null, "SpanMultiTermQueryWrapper([\\* TO z})"); + + assertQueryEquals("[ a TO z ]", null, "SpanMultiTermQueryWrapper([a TO z])"); + assertQueryEquals("{ a TO z}", null, "SpanMultiTermQueryWrapper({a TO z})"); + assertQueryEquals("{ a TO z }", null, "SpanMultiTermQueryWrapper({a TO z})"); + assertQueryEquals("{ a TO z }^2.0", null, "SpanMultiTermQueryWrapper({a TO z})"); + assertBoostEquals("{ a TO z }^2.0", 2.0f); + assertQueryEquals("[ a TO z] OR bar", null, "SpanMultiTermQueryWrapper([a TO z]) bar"); + assertQueryEquals("[ a TO z] AND bar", null, "+SpanMultiTermQueryWrapper([a TO z]) +bar"); + assertQueryEquals("( bar blar { a TO z}) ", null, "bar blar SpanMultiTermQueryWrapper({a TO z})"); + assertQueryEquals("gack ( bar blar { a TO z}) ", null, "gack (bar blar SpanMultiTermQueryWrapper({a TO z}))"); + + assertQueryEquals("[* TO Z]",null,"SpanMultiTermQueryWrapper([\\* TO z])"); + assertQueryEquals("[A TO *]",null,"SpanMultiTermQueryWrapper([a TO \\*])"); + assertQueryEquals("[* TO *]",null,"SpanMultiTermQueryWrapper([\\* TO \\*])"); + } + + public void testRangeWithPhrase() throws Exception { + //different behavior than classic + // assertQueryEquals("[\\* TO \"*\"]",null,"[\\* TO \\*]"); + // assertQueryEquals("[\"*\" TO *]",null,"[\\* TO *]"); + + assertQueryEquals("[\\* TO \"*\"]",null,"SpanMultiTermQueryWrapper([\\* TO \"*\"])"); + assertQueryEquals("[\"*\" TO *]",null,"SpanMultiTermQueryWrapper([\"*\" TO \\*])"); + + } + + private String escapeDateString(String s) { + if (s.indexOf(" ") 
> -1) { + return "\"" + s + "\""; + } else { + return s; + } + } + + /** for testing DateTools support */ + private String getDate(String s, DateTools.Resolution resolution) throws Exception { + // we use the default Locale since LuceneTestCase randomizes it + DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, Locale.getDefault()); + return getDate(df.parse(s), resolution); + } + + /** for testing DateTools support */ + private String getDate(Date d, DateTools.Resolution resolution) { + return DateTools.dateToString(d, resolution); + } + + private String getLocalizedDate(int year, int month, int day) { + // we use the default Locale/TZ since LuceneTestCase randomizes it + DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, Locale.getDefault()); + Calendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.getDefault()); + calendar.clear(); + calendar.set(year, month, day); + calendar.set(Calendar.HOUR_OF_DAY, 23); + calendar.set(Calendar.MINUTE, 59); + calendar.set(Calendar.SECOND, 59); + calendar.set(Calendar.MILLISECOND, 999); + return df.format(calendar.getTime()); + } + + public void testDateRange() throws Exception { + String startDate = getLocalizedDate(2002, 1, 1); + String endDate = getLocalizedDate(2002, 1, 4); + // we use the default Locale/TZ since LuceneTestCase randomizes it + Calendar endDateExpected = new GregorianCalendar(TimeZone.getDefault(), Locale.getDefault()); + endDateExpected.clear(); + endDateExpected.set(2002, 1, 4, 23, 59, 59); + endDateExpected.set(Calendar.MILLISECOND, 999); + final String defaultField = "default"; + final String monthField = "month"; + final String hourField = "hour"; + Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + CommonQueryParserConfiguration qp = getParserConfig(a); + + // set a field specific date resolution + setDateResolution(qp, monthField, DateTools.Resolution.MONTH); + + // set default date resolution to MILLISECOND + 
qp.setDateResolution(DateTools.Resolution.MILLISECOND); + + // set second field specific date resolution + setDateResolution(qp, hourField, DateTools.Resolution.HOUR); + + + // for this field no field specific date resolution has been set, + // so verify if the default resolution is used + assertDateRangeQueryEquals(qp, defaultField, startDate, endDate, + endDateExpected.getTime(), DateTools.Resolution.MILLISECOND); + + // verify if field specific date resolutions are used for these two fields + assertDateRangeQueryEquals(qp, monthField, startDate, endDate, + endDateExpected.getTime(), DateTools.Resolution.MONTH); + + assertDateRangeQueryEquals(qp, hourField, startDate, endDate, + endDateExpected.getTime(), DateTools.Resolution.HOUR); + } + + public void assertDateRangeQueryEquals(CommonQueryParserConfiguration cqpC, String field, String startDate, String endDate, + Date endDateInclusive, DateTools.Resolution resolution) throws Exception { + + assertQueryEquals(cqpC, field, field + ":[" + escapeDateString(startDate) + " TO " + escapeDateString(endDate) + "]", + "SpanMultiTermQueryWrapper([" + getDate(startDate, resolution) + " TO " + getDate(endDateInclusive, resolution) + "])"); + + assertQueryEquals(cqpC, field, field + ":{" + escapeDateString(startDate) + " TO " + escapeDateString(endDate) + "}", + "SpanMultiTermQueryWrapper({" + getDate(startDate, resolution) + " TO " + getDate(endDate, resolution) + "})"); + } + + + + + @Test + public void testEscaped() throws Exception { + Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + //commented out in QueryParserTestBase + /* assertQueryEquals("\\[brackets", a, "\\[brackets"); + assertQueryEquals("\\[brackets", null, "brackets"); + assertQueryEquals("\\\\", a, "\\\\"); + assertQueryEquals("\\+blah", a, "\\+blah"); + assertQueryEquals("\\(blah", a, "\\(blah"); + + assertQueryEquals("\\-blah", a, "\\-blah"); + assertQueryEquals("\\!blah", a, "\\!blah"); + assertQueryEquals("\\{blah", a, 
"\\{blah"); + assertQueryEquals("\\}blah", a, "\\}blah"); + assertQueryEquals("\\:blah", a, "\\:blah"); + assertQueryEquals("\\^blah", a, "\\^blah"); + assertQueryEquals("\\[blah", a, "\\[blah"); + assertQueryEquals("\\]blah", a, "\\]blah"); + assertQueryEquals("\\\"blah", a, "\\\"blah"); + assertQueryEquals("\\(blah", a, "\\(blah"); + assertQueryEquals("\\)blah", a, "\\)blah"); + assertQueryEquals("\\~blah", a, "\\~blah"); + assertQueryEquals("\\*blah", a, "\\*blah"); + assertQueryEquals("\\?blah", a, "\\?blah");*/ + //assertQueryEquals("foo \\&\\& bar", a, "foo \\&\\& bar"); + //assertQueryEquals("foo \\|| bar", a, "foo \\|| bar"); + //assertQueryEquals("foo \\AND bar", a, "foo \\AND bar"); + + assertQueryEquals("\\a", a, "a"); + + assertQueryEquals("a\\-b:c", a, "a-b:c"); + assertQueryEquals("a\\+b:c", a, "a+b:c"); + assertQueryEquals("a\\:b:c", a, "a:b:c"); + assertQueryEquals("a\\\\b:c", a, "a\\b:c"); + + assertQueryEquals("a:b\\-c", a, "a:b-c"); + assertQueryEquals("a:b\\+c", a, "a:b+c"); + assertQueryEquals("a:b\\:c", a, "a:b:c"); + assertQueryEquals("a:b\\\\c", a, "a:b\\c"); + + assertMultitermEquals("a", "a:b\\-c*", a, "b-c*"); + assertMultitermEquals("a", "a:b\\+c*", a, "b+c*"); + assertMultitermEquals("a", "a:b\\:c*", a, "b:c*"); + + assertMultitermEquals("a", "a:b\\\\c*", a, "b\\c*"); + + assertMultitermEquals("a", "a:b\\-c~", a, "b-c~2.0"); + assertMultitermEquals("a", "a:b\\+c~", a, "b+c~2.0"); + assertMultitermEquals("a", "a:b\\:c~", a, "b:c~2.0"); + assertMultitermEquals("a", "a:b\\\\c~", a, "b\\c~2.0"); + + assertMultitermEquals("[ a\\- TO a\\+ ]", "[a- TO a+]"); + assertMultitermEquals("[ a\\: TO a\\~ ]", "[a: TO a~]"); + assertMultitermEquals("[ a\\\\ TO a\\* ]", "[a\\ TO a*]"); + + assertMultitermEquals("[\"c\\:\\\\temp\\\\\\~foo0.txt\" TO \"c\\:\\\\temp\\\\\\~foo9.txt\"]", a, + "[\"c:\\temp\\~foo0.txt\" TO \"c:\\temp\\~foo9.txt\"]"); + //different behavior than classic: doesn't trim leading and trailing quotes + // "[c:\\temp\\~foo0.txt TO 
c:\\temp\\~foo9.txt]"); + + assertQueryEquals("a\\\\\\+b", a, "a\\+b"); + + assertQueryEquals("a \\\"b c\\\" d", a, "a \"b c\" d"); + assertQueryEquals("\"a \\\"b c\\\" d\"", a, "spanNear([a, \"b, c\", d], 0, true)"); + assertQueryEquals("\"a \\+b c d\"", a, "spanNear([a, +b, c, d], 0, true)"); + + assertQueryEquals("c\\:\\\\temp\\\\\\~foo.txt", a, "c:\\temp\\~foo.txt"); + + assertParseException("XY\\"); // there must be a character after the escape char + + // test unicode escaping + assertQueryEquals("a\\u0062c", a, "abc"); + assertQueryEquals("XY\\u005a", a, "XYZ"); + assertQueryEquals("XY\\u005A", a, "XYZ"); + assertQueryEquals("\"a \\\\\\u0028\\u0062\\\" c\"", a, "spanNear([a, \\(b\", c], 0, true)"); + + assertParseException("XY\\u005G"); // test non-hex character in escaped unicode sequence + assertParseException("XY\\u005"); // test incomplete escaped unicode sequence + + // Tests bug LUCENE-800 + assertQueryEquals("(item:\\\\ item:ABCD\\\\)", a, "item:\\ item:ABCD\\"); + assertParseException("(item:\\\\ item:ABCD\\\\))"); // unmatched closing paranthesis + assertQueryEquals("\\*", a, "*"); + assertQueryEquals("\\\\", a, "\\"); // escaped backslash + + assertParseException("\\"); // a backslash must always be escaped + + // LUCENE-1189 + assertQueryEquals("(\"a\\\\\") or (\"b\")", a ,"a\\ or b"); + + //fails actual LUCENE-1189 test, but so does classic query parser + //assertQueryEquals("(name:\"///mike\\\\\\\") or (name:\"alphonse\")", a, + // "name:///mike\\\\\\ or alphonse"); + } + + + public void testEscapedVsQuestionMarkAsWildcard() throws Exception { + Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + //SpanMultiTermQueryWrapper(a:b-?c) + assertMultitermEquals("a", "a:b\\-?c", a, "b\\-?c"); + assertMultitermEquals("a", "a:b\\+?c", a, "b\\+?c"); + assertMultitermEquals("a", "a:b\\:?c", a, "b\\:?c"); + + assertMultitermEquals("a", "a:b\\\\?c", a, "b\\\\?c"); + } + + public void testQueryStringEscaping() throws Exception { + 
Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + + assertEscapedQueryEquals("a-b:c", a, "a\\-b\\:c"); + assertEscapedQueryEquals("a+b:c", a, "a\\+b\\:c"); + assertEscapedQueryEquals("a:b:c", a, "a\\:b\\:c"); + assertEscapedQueryEquals("a\\b:c", a, "a\\\\b\\:c"); + + assertEscapedQueryEquals("a:b-c", a, "a\\:b\\-c"); + assertEscapedQueryEquals("a:b+c", a, "a\\:b\\+c"); + assertEscapedQueryEquals("a:b:c", a, "a\\:b\\:c"); + assertEscapedQueryEquals("a:b\\c", a, "a\\:b\\\\c"); + + assertEscapedQueryEquals("a:b-c*", a, "a\\:b\\-c\\*"); + assertEscapedQueryEquals("a:b+c*", a, "a\\:b\\+c\\*"); + assertEscapedQueryEquals("a:b:c*", a, "a\\:b\\:c\\*"); + + assertEscapedQueryEquals("a:b\\\\c*", a, "a\\:b\\\\\\\\c\\*"); + + assertEscapedQueryEquals("a:b-?c", a, "a\\:b\\-\\?c"); + assertEscapedQueryEquals("a:b+?c", a, "a\\:b\\+\\?c"); + assertEscapedQueryEquals("a:b:?c", a, "a\\:b\\:\\?c"); + + assertEscapedQueryEquals("a:b?c", a, "a\\:b\\?c"); + + assertEscapedQueryEquals("a:b-c~", a, "a\\:b\\-c\\~"); + assertEscapedQueryEquals("a:b+c~", a, "a\\:b\\+c\\~"); + assertEscapedQueryEquals("a:b:c~", a, "a\\:b\\:c\\~"); + assertEscapedQueryEquals("a:b\\c~", a, "a\\:b\\\\c\\~"); + + assertEscapedQueryEquals("[ a - TO a+ ]", null, "\\[ a \\- TO a\\+ \\]"); + assertEscapedQueryEquals("[ a : TO a~ ]", null, "\\[ a \\: TO a\\~ \\]"); + assertEscapedQueryEquals("[ a\\ TO a* ]", null, "\\[ a\\\\ TO a\\* \\]"); + + // LUCENE-881 + assertEscapedQueryEquals("|| abc ||", a, "\\|\\| abc \\|\\|"); + assertEscapedQueryEquals("&& abc &&", a, "\\&\\& abc \\&\\&"); + } + + public void testTabNewlineCarriageReturn() + throws Exception { + assertQueryEqualsDOA("+weltbank +worlbank", null, + "+weltbank +worlbank"); + + assertQueryEqualsDOA("+weltbank\n+worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \n+worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \n +worlbank", null, + "+weltbank +worlbank"); + + 
assertQueryEqualsDOA("+weltbank\r+worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \r+worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \r +worlbank", null, + "+weltbank +worlbank"); + + assertQueryEqualsDOA("+weltbank\r\n+worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \r\n+worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \r\n +worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \r \n +worlbank", null, + "+weltbank +worlbank"); + + assertQueryEqualsDOA("+weltbank\t+worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \t+worlbank", null, + "+weltbank +worlbank"); + assertQueryEqualsDOA("weltbank \t +worlbank", null, + "+weltbank +worlbank"); + } + + public void testSimpleDAO() + throws Exception { + assertQueryEqualsDOA("term term term", null, "+term +term +term"); + assertQueryEqualsDOA("term +term term", null, "+term +term +term"); + assertQueryEqualsDOA("term term +term", null, "+term +term +term"); + assertQueryEqualsDOA("term +term +term", null, "+term +term +term"); + assertQueryEqualsDOA("-term term term", null, "-term +term +term"); + } + + public void testBoost() + throws Exception { + CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on")); + Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords); + CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer); + Query q = getQuery("on^1.0",qp); + assertNotNull(q); + q = getQuery("\"hello\"^2.0",qp); + assertNotNull(q); + assertEquals(q.getBoost(), (float) 2.0, (float) 0.01); + q = getQuery("hello^2.0",qp); + assertNotNull(q); + assertEquals(q.getBoost(), (float) 2.0, (float) 0.01); + q = getQuery("\"on\"^1.0",qp); + assertNotNull(q); + + Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); + CommonQueryParserConfiguration 
qp2 = getParserConfig(a2); + q = getQuery("the^3", qp2); + // "the" is a stop word so the result is an empty query: + assertNotNull(q); + assertEquals("spanOr([])", q.toString()); + assertEquals(1.0f, q.getBoost(), 0.01f); + } + + public void assertParseException(String queryString) throws Exception { + try { + getQuery(queryString); + } catch (Exception expected) { + if(isQueryParserException(expected)){ + return; + } + } + fail("ParseException expected, not thrown"); + } + + public void assertParseException(String queryString, Analyzer a) throws Exception { + try { + getQuery(queryString, a); + } catch (Exception expected) { + if(isQueryParserException(expected)){ + return; + } + } + fail("ParseException expected, not thrown"); + } + + public void testException() throws Exception { + assertParseException("\"some phrase"); + assertParseException("(foo bar"); + assertParseException("foo bar))"); + assertParseException("field:term:with:colon some more terms"); + assertParseException("(sub query)^5.0^2.0 plus more"); + assertParseException("secret AND illegal) AND access:confidential"); + } + + public void testBooleanQuery() throws Exception { + BooleanQuery.setMaxClauseCount(2); + Analyzer purWhitespaceAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + assertParseException("one two three", purWhitespaceAnalyzer); + } + + /** + * This test differs from TestPrecedenceQueryParser + */ + public void testPrecedence() throws Exception { + CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)); + Query query1 = getQuery("A AND B OR C AND D", qp); + Query query2 = getQuery("+A +B +C +D", qp); + assertEquals(query1, query2); + } + + // Todo: convert this from DateField to DateUtil + // public void testLocalDateFormat() throws IOException, ParseException { + // Directory ramDir = newDirectory(); + // IndexWriter iw = new IndexWriter(ramDir, newIndexWriterConfig( TEST_VERSION_CURRENT, new 
MockAnalyzer(random, MockTokenizer.WHITESPACE, false))); + // addDateDoc("a", 2005, 12, 2, 10, 15, 33, iw); + // addDateDoc("b", 2005, 12, 4, 22, 15, 00, iw); + // iw.close(); + // IndexSearcher is = new IndexSearcher(ramDir, true); + // assertHits(1, "[12/1/2005 TO 12/3/2005]", is); + // assertHits(2, "[12/1/2005 TO 12/4/2005]", is); + // assertHits(1, "[12/3/2005 TO 12/4/2005]", is); + // assertHits(1, "{12/1/2005 TO 12/3/2005}", is); + // assertHits(1, "{12/1/2005 TO 12/4/2005}", is); + // assertHits(0, "{12/3/2005 TO 12/4/2005}", is); + // is.close(); + // ramDir.close(); + // } + // + // private void addDateDoc(String content, int year, int month, + // int day, int hour, int minute, int second, IndexWriter iw) throws IOException { + // Document d = new Document(); + // d.add(newField(FIELD, content, Field.Store.YES, Field.Index.ANALYZED)); + // Calendar cal = Calendar.getInstance(Locale.ENGLISH); + // cal.set(year, month - 1, day, hour, minute, second); + // d.add(newField("date", DateField.dateToString(cal.getTime()), Field.Store.YES, Field.Index.NOT_ANALYZED)); + // iw.addDocument(d); + // } + + // public abstract void testStarParsing() throws Exception; + + public void testEscapedWildcard() throws Exception { + CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)); + WildcardQuery q = new WildcardQuery(new Term(FIELD, "foo\\?ba?r")); + SpanMultiTermQueryWrapper wq = new SpanMultiTermQueryWrapper(q); + assertEquals(wq, getQuery("foo\\?ba?r", qp)); + } + + public void testRegexps() throws Exception { + CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)); + RegexpQuery q = new RegexpQuery(new Term(FIELD, "[a-z][123]")); + assertEqualsWrappedRegexp(q, getQuery("/[a-z][123]/",qp)); + qp.setLowercaseExpandedTerms(true); + assertEqualsWrappedRegexp(q, getQuery("/[A-Z][123]/",qp)); + q.setBoost(0.5f); + //assertEqualsWrappedRegexp(q, 
getQuery("/[A-Z][123]/^0.5",qp)); + assertBoostEquals("/[A-Z][123]/^0.5", 0.5f); + qp.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + q.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + assertTrue(getQuery("/[A-Z][123]/^0.5",qp) instanceof SpanMultiTermQueryWrapper); + // assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE, + // ((SpanMultiTermQueryWrapper)getQuery("/[A-Z][123]/^0.5",qp)).getRewriteMethod()); + // assertEqualsWrappedRegexp(q, getQuery("/[A-Z][123]/^0.5",qp)); + assertBoostEquals("/[A-Z][123]/^0.5", 0.5f); + + qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); + + SpanMultiTermQueryWrapper escaped = + //SQP changed [a-z]\\/[123] to [a-z]/[123] + new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "[a-z]/[123]"))); + + assertEquals(escaped, getQuery("/[a-z]\\/[123]/",qp)); + SpanMultiTermQueryWrapper escaped2 = + new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "[a-z]\\*[123]"))); + assertEquals(escaped2, getQuery("/[a-z]\\*[123]/",qp)); + + BooleanQuery complex = new BooleanQuery(); + complex.add(new SpanMultiTermQueryWrapper( + new RegexpQuery(new Term(FIELD, "[a-z]/[123]"))), Occur.MUST); + complex.add(new SpanTermQuery(new Term("path", "/etc/init.d/")), Occur.MUST); + complex.add(new SpanTermQuery(new Term(FIELD, "/etc/init[.]d/lucene/")), Occur.SHOULD); + // assertEquals(complex, getQuery("/[a-z]\\/[123]/ AND path:\"/etc/init.d/\" OR \"/etc\\/init\\[.\\]d/lucene/\" ",qp)); + assertEquals(complex, getQuery("/[a-z]\\/[123]/ AND path:\\/etc\\/init.d\\/ OR \\/etc\\/init\\[.\\]d/lucene\\/ ",qp)); + + Query re = new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "http.*"))); + assertEquals(re, getQuery(FIELD+":/http.*/",qp)); + assertEquals(re, getQuery("/http.*/",qp)); + + re = new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "http~0.5"))); + assertEquals(re, getQuery(FIELD+":/http~0.5/",qp)); + assertEquals(re, 
getQuery("/http~0.5/",qp)); + + re = new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "boo"))); + assertEquals(re, getQuery(FIELD+":/boo/",qp)); + assertEquals(re, getQuery("/boo/",qp)); + + // assertEquals(new SpanTermQuery(new Term(FIELD, "/boo/")), getQuery("\"/boo/\"",qp)); + assertEquals(new SpanTermQuery(new Term(FIELD, "/boo/")), getQuery("\\/boo\\/",qp)); + + BooleanQuery two = new BooleanQuery(); + two.add(new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "foo"))), Occur.SHOULD); + two.add(new SpanMultiTermQueryWrapper(new RegexpQuery(new Term(FIELD, "bar"))), Occur.SHOULD); + assertEquals(two, getQuery(FIELD+":/foo/ "+FIELD+":/bar/",qp)); + assertEquals(two, getQuery("/foo/ /bar/",qp)); + } + + + public void testStopwords() throws Exception { + CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton()); + CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet)); + Query result = getQuery("field:the OR field:foo",qp); + assertNotNull("result is null and it shouldn't be", result); + assertTrue("result is not a BooleanQuery", result instanceof SpanOrQuery); + assertEquals(0, ((SpanOrQuery)result).getClauses().length); + result = getQuery("field:woo OR field:the",qp); + assertNotNull("result is null and it shouldn't be", result); + assertTrue("result is not a TermQuery", result instanceof SpanTermQuery); + result = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)",qp); + assertNotNull("result is null and it shouldn't be", result); + assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery); + if (VERBOSE) System.out.println("Result: " + result); + assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 2, ((BooleanQuery) result).clauses().size() == 2); + } + + public void testPositionIncrement() throws Exception { + CommonQueryParserConfiguration qp = getParserConfig( new 
MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)); + qp.setEnablePositionIncrements(true); + String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\""; + // 0 2 5 7 8 + SpanNearQuery pq = (SpanNearQuery) getQuery(qtxt,qp); + //System.out.println("Query text: "+qtxt); + //System.out.println("Result: "+pq); + SpanQuery[] clauses = pq.getClauses(); + assertEquals(clauses.length, 5); + Set expected = new HashSet(); + expected.add(new Term(FIELD, "words")); + expected.add(new Term(FIELD, "poisitions")); + expected.add(new Term(FIELD, "pos")); + expected.add(new Term(FIELD, "stopped")); + expected.add(new Term(FIELD, "phrasequery")); + + Set terms = new HashSet(); + for (int i = 0; i < clauses.length; i++){ + SpanQuery q = clauses[i]; + q.extractTerms(terms); + } + assertEquals(expected, terms); + } + + public void testMatchAllDocs() throws Exception { + CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)); + assertEquals(new MatchAllDocsQuery(), getQuery("*:*",qp)); + assertEquals(new MatchAllDocsQuery(), getQuery("(*:*)",qp)); + BooleanQuery bq = (BooleanQuery)getQuery("+*:* -*:*",qp); + assertTrue(bq.getClauses()[0].getQuery() instanceof MatchAllDocsQuery); + assertTrue(bq.getClauses()[1].getQuery() instanceof MatchAllDocsQuery); + } + + + @Override + public void tearDown() throws Exception { + BooleanQuery.setMaxClauseCount(originalMaxClauses); + super.tearDown(); + } + + public Query getQueryDOA(String query, Analyzer a) + throws Exception { + if (a == null) + a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + CommonQueryParserConfiguration qp = getParserConfig(a); + setDefaultOperatorAND(qp); + return getQuery(query, qp); + } + + private void setDefaultOperatorAND(CommonQueryParserConfiguration qp) { + ((SpanQueryParser)qp).setDefaultOperator(Operator.AND); + } + + + public void assertQueryEqualsDOA(String query, Analyzer a, String 
result) + throws Exception { + Query q = getQueryDOA(query, a); + String s = q.toString(FIELD); + if (!s.equals(result)) { + fail("Query /" + query + "/ yielded /" + s + + "/, expecting /" + result + "/"); + } + } +} Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestAdvancedAnalyzers.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestAdvancedAnalyzers.java (revision 0) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestAdvancedAnalyzers.java (revision 0) @@ -0,0 +1,568 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.queryparser.spans; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.queryparser.spans.AnalyzingQueryParserBase.NORM_MULTI_TERMS; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TotalHitCountCollector; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util._TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class TestAdvancedAnalyzers extends LuceneTestCase { + + private static IndexReader reader; + private static IndexSearcher searcher; + private static Directory 
directory; + private static Analyzer synAnalyzer; + private static Analyzer baseAnalyzer; + private static Analyzer ucVowelAnalyzer; + private static final String FIELD1 = "f1"; + private static final String FIELD2 = "f2"; + private static final String FIELD3 = "f3"; + private static final String FIELD4 = "f4"; + + + // private static final CharacterRunAutomaton STOP_WORDS = new CharacterRunAutomaton( + // BasicOperations.union(Arrays.asList(makeString("a"), makeString("an")))); + + @BeforeClass + public static void beforeClass() throws Exception { + + synAnalyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, + true); + TokenFilter filter = new MockNonWhitespaceFilter(tokenizer); + + filter = new MockSynFilter(filter); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + baseAnalyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, + true); + TokenFilter filter = new MockNonWhitespaceFilter(tokenizer); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + ucVowelAnalyzer = new Analyzer(){ + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, + true); + TokenFilter filter = new MockUCVowelFilter(tokenizer); + return new TokenStreamComponents(tokenizer, filter); + } + }; + Analyzer tmpUCVowelAnalyzer = new Analyzer(){ + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, + true); + TokenFilter filter = new MockUCVowelFilter(tokenizer); + return new TokenStreamComponents(tokenizer, filter); + } + }; + directory = newDirectory(); + 
RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, baseAnalyzer) + .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000)) + .setMergePolicy(newLogMergePolicy())); + String[] docs = new String[] { + "abc_def", + "lmnop", + "abc", + "qrs tuv", + "qrs_tuv" + }; + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(newTextField(FIELD1, docs[i], Field.Store.YES)); + TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES); + tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i])); + doc.add(tf); + doc.add(newTextField(FIELD3, docs[i], Field.Store.YES)); + + TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES); + tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i])); + doc.add(tf4); + writer.addDocument(doc); + } + reader = writer.getReader(); + searcher = new IndexSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + reader = null; + directory = null; + synAnalyzer = null; + baseAnalyzer = null; + } + + @Test + public void testSynBasic() throws Exception { + + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, synAnalyzer); + countSpansDocs(p, "tuv", 2, 2); + + countSpansDocs(p, "abc", 6, 4); + + } + + @Test + public void testNonWhiteSpace() throws Exception { + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT,FIELD1, baseAnalyzer); + String s = "[zqx_qrs^3.0]~3^2"; + Query q = p.parse(s); + assertTrue(q instanceof SpanNearQuery); + + SpanNearQuery near = (SpanNearQuery)q; + SpanQuery[] clauses = near.getClauses(); + assertEquals(2, clauses.length); + + assertEquals(3, near.getSlop()); + assertTrue(clauses[0] instanceof SpanTermQuery); + assertTrue(clauses[1] instanceof SpanTermQuery); + + assertEquals("zqx", ((SpanTermQuery)clauses[0]).getTerm().text()); + assertEquals("qrs", 
((SpanTermQuery)clauses[1]).getTerm().text()); + + //take the boost from the phrase, ignore boost on term + //not necessarily right choice, but this is how it works now + assertEquals(2.0f, q.getBoost(), 0.00001f); + + s = "[zqx2_qrs3 lmnop]~3"; + p.setAutoGeneratePhraseQueries(true); + q = p.parse(s); + assertTrue(q instanceof SpanQuery); + assertTrue(q instanceof SpanNearQuery); + near = (SpanNearQuery)q; + clauses = near.getClauses(); + assertEquals(2, clauses.length); + + assertEquals(3, near.getSlop()); + assertTrue(clauses[0] instanceof SpanNearQuery); + assertTrue(clauses[1] instanceof SpanTermQuery); + + SpanNearQuery child = (SpanNearQuery)clauses[0]; + SpanQuery[] childClauses = child.getClauses(); + assertEquals(2, childClauses.length); + + assertEquals("zqx", ((SpanTermQuery)childClauses[0]).getTerm().text()); + assertEquals("qrs", ((SpanTermQuery)childClauses[1]).getTerm().text()); + + assertTrue(child.isInOrder()); + assertEquals(child.getSlop(), 0); + } + + //test different initializations/settings with multifield analyzers + @Test + public void testAnalyzerCombos() throws Exception{ + //wt = whole term + Map wt = new HashMap(); + Map mt = new HashMap(); + + + //basic, correct set up + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer); + assertEquals(1, countDocs((SpanQuery)p.parse("lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse("lm*op"))); + assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP"))); + assertEquals(1, countDocs((SpanQuery)p.parse("LM*OP"))); + assertEquals(NORM_MULTI_TERMS.LOWERCASE, p.getNormMultiTerms()); + + + + //basic, correct set up + p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD2, ucVowelAnalyzer); + assertEquals(NORM_MULTI_TERMS.LOWERCASE, p.getNormMultiTerms()); + assertEquals(1, countDocs((SpanQuery)p.parse("lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP"))); + assertEquals(0, countDocs((SpanQuery)p.parse("LM*OP"))); + + //set to lowercase only, won't 
analyze + assertEquals(0, countDocs((SpanQuery)p.parse("lm*op"))); + p.setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE); + assertEquals(1, countDocs((SpanQuery)p.parse("lm*op"))); + assertEquals(1, countDocs((SpanQuery)p.parse("LM*OP"))); + + //try sister field, to prove that default analyzer is ucVowelAnalyzer for + //unspecified fieldsd + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD4+":lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD4+":lm*op"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD4+":LMNOP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD4+":LM*OP"))); + + //try mismatching sister field + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD3+":lmnop"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD3+":lm*op"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD3+":LMNOP"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD3+":LM*OP"))); + + //advanced, correct set up (for wt but not for mt) + p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer); + assertEquals(p.getNormMultiTerms(), NORM_MULTI_TERMS.LOWERCASE); + wt.clear(); mt.clear(); + wt.put(FIELD2, ucVowelAnalyzer); + p.setAnalyzers(wt, mt); + assertEquals(NORM_MULTI_TERMS.ANALYZE, p.getNormMultiTerms()); + assertEquals(1, countDocs((SpanQuery)p.parse("lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lm*op"))); + + //advanced, correct set up for both + wt.clear(); mt.clear(); + wt.put(FIELD2, ucVowelAnalyzer); + mt.put(FIELD2, ucVowelAnalyzer); + assertEquals(NORM_MULTI_TERMS.ANALYZE, p.getNormMultiTerms()); + assertEquals(1, countDocs((SpanQuery)p.parse("lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmnop"))); + assertEquals(1, 
countDocs((SpanQuery)p.parse(FIELD2+":LMNOP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lm*op"))); + + p.setNormMultiTerms(NORM_MULTI_TERMS.NONE); + assertEquals(NORM_MULTI_TERMS.NONE, p.getNormMultiTerms()); + assertEquals(1, countDocs((SpanQuery)p.parse("lmnop"))); + //analyzer still used on whole terms; don't forget! + assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP"))); + assertEquals(0, countDocs((SpanQuery)p.parse("LM*OP"))); + + p.setNormMultiTerms(NORM_MULTI_TERMS.LOWERCASE); + assertEquals(NORM_MULTI_TERMS.LOWERCASE, p.getNormMultiTerms()); + assertEquals(1, countDocs((SpanQuery)p.parse("lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse("LMNOP"))); + assertEquals(1, countDocs((SpanQuery)p.parse("LM*OP"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP"))); + + //mismatch between default field and default analyzer; should return 0 + p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, ucVowelAnalyzer); + assertEquals(0, countDocs((SpanQuery)p.parse("lmnop"))); + assertEquals(0, countDocs((SpanQuery)p.parse("LMNOP"))); + assertEquals(0, countDocs((SpanQuery)p.parse("lmnOp"))); + + p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer, ucVowelAnalyzer); + //cstr with two analyzers sets normMultiTerms = NORM_MULTI_TERM.ANALYZE + //can't find any in field1 because these trigger multiTerm analysis + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":lm*op"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":lmno*"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":lmmop~1"))); + + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":LM*OP"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":LMNO*"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD1+":LMMOP~1"))); + + //can find these in field2 because of multiterm analysis + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lm*op"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmno*"))); + 
assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1"))); + + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1"))); + + //try basic use case + p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer); + //can't find these in field2 because multiterm analysis is using baseAnalyzer + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lm*op"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmno*"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1"))); + + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1"))); + + p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, baseAnalyzer); + p.setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE); + wt.clear(); + wt.put(FIELD2, ucVowelAnalyzer); + mt.clear(); + mt.put(FIELD2, ucVowelAnalyzer); + p.setAnalyzers(wt, mt); + + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lm*op"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmno*"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1"))); + + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1"))); + + + //now try adding the wrong analyzer for the whole term, but the + //right multiterm analyzer + wt.put(FIELD2, baseAnalyzer); + p.setAnalyzers(wt, mt); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmnop"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lm*op"))); + assertEquals(1, 
countDocs((SpanQuery)p.parse(FIELD2+":lmno*"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1"))); + + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*"))); + assertEquals(1, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1"))); + + //now set them completely improperly + mt.put(FIELD2, baseAnalyzer); + p.setAnalyzers(wt, mt); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmnop"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lm*op"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmno*"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":lmmop~1"))); + + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMNOP"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LM*OP"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMNO*"))); + assertEquals(0, countDocs((SpanQuery)p.parse(FIELD2+":LMMOP~1"))); + + } + + private void countSpansDocs(SpanQueryParser p, String s, int spanCount, + int docCount) throws Exception { + SpanQuery q = (SpanQuery)p.parse(s); + assertEquals("spanCount: " + s, spanCount, countSpans(q)); + assertEquals("docCount: " + s, docCount, countDocs(q)); + + } + + private long countSpans(SpanQuery q) throws Exception { + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + q = (SpanQuery) q.rewrite(ctx.reader()); + Spans spans = q.getSpans(ctx, null, new HashMap()); + + long i = 0; + while (spans.next()) { + i++; + } + return i; + } + + private long countDocs(SpanQuery q) throws Exception { + OpenBitSet docs = new OpenBitSet(); + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + IndexReaderContext parentCtx = reader.getContext(); + q = (SpanQuery) q.rewrite(ctx.reader()); + + Set qTerms = new HashSet(); + q.extractTerms(qTerms); + Map 
termContexts = new HashMap(); + + for (Term t : qTerms) { + TermContext c = TermContext.build(parentCtx, t); + termContexts.put(t, c); + } + + Spans spans = q.getSpans(ctx, null, termContexts); + + while (spans.next()) { + docs.set(spans.doc()); + } + long spanDocHits = docs.cardinality(); + // double check with a regular searcher + TotalHitCountCollector coll = new TotalHitCountCollector(); + searcher.search(q, coll); + assertEquals(coll.getTotalHits(), spanDocHits); + return spanDocHits; + + } + + + + /** + * Mocks a synonym filter. When it encounters "abc" it adds "qrs" and "tuv" + */ + private final static class MockSynFilter extends TokenFilter { + private List synBuffer = new LinkedList(); + + private final CharTermAttribute termAtt; + private final PositionIncrementAttribute posIncrAtt; + + public MockSynFilter(TokenStream in) { + super(in); + termAtt = addAttribute(CharTermAttribute.class); + posIncrAtt = addAttribute(PositionIncrementAttribute.class); + } + + @Override + public final boolean incrementToken() throws java.io.IOException { + if (synBuffer.size() > 0) { + termAtt.setEmpty().append(synBuffer.remove(0)); + posIncrAtt.setPositionIncrement(0); + return true; + } else { + boolean next = input.incrementToken(); + if (!next) { + return false; + } + String text = termAtt.toString(); + if (text.equals("abc")){ + synBuffer.add("qrs"); + synBuffer.add("tuv"); + } + return true; + } + } + + @Override + public void reset() throws IOException { + super.reset(); + } + } + + + /** + * Mocks what happens in a non-whitespace language. Tokenizes on white space and "_". 
+ */ + private final static class MockNonWhitespaceFilter extends TokenFilter { + private List buffer = new LinkedList(); + + private final CharTermAttribute termAtt; + + public MockNonWhitespaceFilter(TokenStream in) { + super(in); + termAtt = addAttribute(CharTermAttribute.class); + } + + @Override + public final boolean incrementToken() throws java.io.IOException { + if (buffer.size() > 0) { + termAtt.setEmpty().append(buffer.remove(0)); + return true; + } else { + boolean next = input.incrementToken(); + if (!next) { + return false; + } + String text = termAtt.toString(); + + String[] bits = text.split("_"); + String ret = text; + if (bits.length > 1){ + ret = bits[0]; + for (int i = 1; i < bits.length; i++){ + buffer.add(bits[i]); + } + } + termAtt.setEmpty().append(ret); + return true; + } + } + + @Override + public void reset() throws IOException { + super.reset(); + } + } + + + //mocks uppercasing vowels to test different analyzers for different fields + private final static class MockUCVowelFilter extends TokenFilter { + private final Pattern PATTERN = Pattern.compile("([aeiou])"); + private final CharTermAttribute termAtt; + + public MockUCVowelFilter(TokenStream in) { + super(in); + termAtt = addAttribute(CharTermAttribute.class); + } + + @Override + public final boolean incrementToken() throws java.io.IOException { + + boolean next = input.incrementToken(); + if (!next) { + return false; + } + String text = termAtt.toString().toLowerCase(); + Matcher m = PATTERN.matcher(text); + StringBuffer sb = new StringBuffer(); + while (m.find()){ + m.appendReplacement(sb, m.group(1).toUpperCase()); + } + m.appendTail(sb); + text = sb.toString(); + termAtt.setEmpty().append(text); + return true; + + } + + @Override + public void reset() throws IOException { + super.reset(); + } + } + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java =================================================================== --- 
lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java (revision 0) @@ -0,0 +1,988 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.flexible.core.util.UnescapedCharSequence; +import org.apache.lucene.queryparser.flexible.standard.parser.EscapeQuerySyntaxImpl; +import org.apache.lucene.sandbox.queries.SlowFuzzyQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.util.BytesRef; +/** + * This class overrides some important functionality within QueryParserBase, esp. + * for generating terminal spanquery nodes: term, range, regex, fuzzy, etc. + *

+ * If SpanQueries are ever removed, it should be straightforward to refactor
+ * the classes that extend this class to extend QueryParserBase directly.
+ *

+ * This should also allow for an easy transfer to javacc or similar. + * + */ +public abstract class SpanQueryParserBase extends AnalyzingQueryParserBase{ + + //better to make these public in QueryParserBase + public static final int CONJ_NONE = 0; + public static final int CONJ_AND = 1; + public static final int CONJ_OR = 2; + + public static final int MOD_NONE = 0; + public static final int MOD_NOT = 10; + public static final int MOD_REQ = 11; + + + public static final float UNSPECIFIED_BOOST = -1.0f; + public static final int UNSPECIFIED_SLOP = -1; + public static final Boolean UNSPECIFIED_IN_ORDER = null; + public static final float DEFAULT_BOOST = 1.0f; + + public static final boolean DEFAULT_IN_ORDER = true; + + + + private static final Pattern FUZZY_PATTERN = Pattern + .compile("~(>)?(?:(\\d+)?(?:\\.(\\d+))?)?(?:,(\\d+))?$"); + private final Pattern WILDCARD_PATTERN = Pattern.compile("([?*]+)"); + + + private int spanNearMaxDistance = 100; + private int spanNotNearMaxDistance = 50; + //if a full term is analyzed and the analyzer returns nothing, + //should a ParseException be thrown or should I just ignore the full token. + private boolean throwExceptionForEmptyTerm = false; + + + + + /////// + // Override getXQueries to return span queries + // Lots of boilerplate. Sorry. + ////// + + //not overriding: newMatchAllDocsQuery + + @Override + protected Query newRegexpQuery(Term t){ + Query q = super.newRegexpQuery(t); + return new SpanMultiTermQueryWrapper((RegexpQuery) q); + } + + + /** + * Factory method for generating a query (similar to + * {@link #getWildcardQuery}). Called when parser parses + * an input term token that has the fuzzy suffix (~) appended. + * + * @param field Name of the field query will use. 
+ * @param termStr Term token to use for building term for the query + * + * @return Resulting {@link org.apache.lucene.search.Query} built for the term + * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow + */ + protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException + { + return getFuzzyQuery(field, termStr, minSimilarity, getFuzzyPrefixLength(), FuzzyQuery.defaultTranspositions); + } + /** + * Factory method for generating a query (similar to + * {@link #getWildcardQuery}). Called when parser parses + * an input term token that has the fuzzy suffix (~) appended. + * + * @param field Name of the field query will use. + * @param termStr Term token to use for building term for the query + * + * @return Resulting {@link org.apache.lucene.search.Query} built for the term + * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow + */ + protected Query getFuzzyQuery(String field, String termStr, + float minSimilarity, int prefixLength) throws ParseException + { + return getFuzzyQuery(field, termStr, minSimilarity, prefixLength, FuzzyQuery.defaultTranspositions); + } + + /** + * + * @param field + * @param termStr + * @param minSimilarity + * @param prefixLength + * @param transpositions + * @return query + * @throws ParseException, RuntimeException if there was an IOException from the analysis process + */ + protected Query getFuzzyQuery(String field, String termStr, + float minSimilarity, int prefixLength, boolean transpositions) throws ParseException{ + if (getNormMultiTerms() == NORM_MULTI_TERMS.ANALYZE){ + termStr = analyzeMultitermTermParseEx(field, termStr).utf8ToString(); + } else if (getNormMultiTerms() == NORM_MULTI_TERMS.LOWERCASE){ + termStr = termStr.toLowerCase(getLocale()); + } + Term t = new Term(field, unescape(termStr)); + return newFuzzyQuery(t, minSimilarity, prefixLength, transpositions); + } + + + + 
/** + * Creates a new fuzzy term. + * If minimumSimilarity is >= 1.0f, this rounds to avoid + * exception for numEdits != whole number. + * + * @param t + * @param minimumSimilarity + * @param prefixLength + * @param transpositions + * @return fuzzy query + */ + protected Query newFuzzyQuery(Term t, float minimumSimilarity, int prefixLength, + boolean transpositions){ + + if (minimumSimilarity <=0.0f){ + return newTermQuery(t); + } + String text = t.text(); + int numEdits = 0; + int len = text.codePointCount(0, text.length()); + if (getFuzzyMinSim() < 1.0f){ + //if both are < 1.0 then make sure that parameter that was passed in + //is >= than fuzzyminsim + if (minimumSimilarity < 1.0f){ + minimumSimilarity = (minimumSimilarity < getFuzzyMinSim())? getFuzzyMinSim() : minimumSimilarity; + + numEdits = unboundedFloatToEdits(minimumSimilarity, len); + + } else { + //if fuzzyMinSim < 1.0 and the parameter that was passed in + //is >= 1, convert that to a %, test against fuzzyminsim and then + //recalculate numEdits + float tmpSim = (len-minimumSimilarity)/(float)len; + tmpSim = (tmpSim < getFuzzyMinSim())? getFuzzyMinSim() : tmpSim; + numEdits = unboundedFloatToEdits(tmpSim, len); + } + } else { + //if fuzzyMinSim >= 1.0f + + if (minimumSimilarity < 1.0f){ + int tmpNumEdits = unboundedFloatToEdits(minimumSimilarity, len); + numEdits = (tmpNumEdits >= (int)getFuzzyMinSim())?(int)getFuzzyMinSim() : tmpNumEdits; + } else { + numEdits = (minimumSimilarity >= getFuzzyMinSim())? (int) getFuzzyMinSim() : (int)minimumSimilarity; + } + } + /* + * This currently picks btwn FQ and SlowFQ based on numEdits. + * This is only because SFQ doesn't allow transpositions yet. + * Once SFQ does allow transpositions, this can be changed to + * run SFQ only...because SFQ does the right thing and returns + * an Automaton for numEdits <= 2. 
+ */ + if (numEdits <= FuzzyQuery.defaultMaxEdits){ + FuzzyQuery fq =new FuzzyQuery(t, numEdits, prefixLength, FuzzyQuery.defaultMaxExpansions, + transpositions); + fq.setRewriteMethod(getMultiTermRewriteMethod()); + return new SpanMultiTermQueryWrapper(fq); + + } else { + SlowFuzzyQuery sfq = new SlowFuzzyQuery(t, + numEdits, prefixLength); + sfq.setRewriteMethod(getMultiTermRewriteMethod()); + return new SpanMultiTermQueryWrapper(sfq); + } + } + + + + @Override + protected Query newPrefixQuery(Term t){ + PrefixQuery q = new PrefixQuery(t); + q.setRewriteMethod(getMultiTermRewriteMethod()); + return new SpanMultiTermQueryWrapper(q); + + } + /** + * Factory method for generating a query (similar to + * {@link #getWildcardQuery}). Called when parser parses an input term + * token that uses prefix notation; that is, contains a single '*' wildcard + * character as its last character. Since this is a special case + * of generic wildcard term, and such a query can be optimized easily, + * this usually results in a different query object. + *

+ * Depending on settings, a prefix term may be lower-cased + * automatically. It will not go through the default Analyzer, + * however, since normal Analyzers are unlikely to work properly + * with wildcard templates. + *

+ * Can be overridden by extending classes, to provide custom handling for + * wild card queries, which may be necessary due to missing analyzer calls. + * + * @param field Name of the field query will use. + * @param termStr Term token to use for building term for the query + * (without trailing '*' character!) + * + * @return Resulting {@link org.apache.lucene.search.Query} built for the term + * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow + */ + protected Query getPrefixQuery(String field, String termStr) throws ParseException + { + + if (!getAllowLeadingWildcard() && termStr.startsWith("*")) + throw new ParseException("'*' not allowed as first character in PrefixQuery"); + + if (getNormMultiTerms() == NORM_MULTI_TERMS.ANALYZE){ + termStr = analyzeMultitermTermParseEx(field, termStr).utf8ToString(); + } else if (getNormMultiTerms() == NORM_MULTI_TERMS.LOWERCASE){ + termStr = termStr.toLowerCase(getLocale()); + } + Term t = new Term(field, unescape(termStr)); + return newPrefixQuery(t); + } + + @Override + protected Query newWildcardQuery(Term t){ + WildcardQuery q = new WildcardQuery(t); + q.setRewriteMethod(getMultiTermRewriteMethod()); + return new SpanMultiTermQueryWrapper(q); + } + /** + * Factory method for generating a query. Called when parser + * parses an input term token that contains one or more wildcard + * characters (? and *), but is not a prefix term token (one + * that has just a single * character at the end) + *

+ * Depending on settings, prefix term may be lower-cased + * automatically. It will not go through the default Analyzer, + * however, since normal Analyzers are unlikely to work properly + * with wildcard templates. + *

+ * Can be overridden by extending classes, to provide custom handling for + * wildcard queries, which may be necessary due to missing analyzer calls. + * + * @param field Name of the field query will use. + * @param termStr Term token that contains one or more wild card + * characters (? or *), but is not simple prefix term + * + * @return Resulting {@link org.apache.lucene.search.Query} built for the term + * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow + */ + @Override + protected Query getWildcardQuery(String field, String termStr) throws ParseException + { + if ("*".equals(field)) { + if ("*".equals(termStr)) return newMatchAllDocsQuery(); + } + if (!getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) + throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"); + + if (getNormMultiTerms() == NORM_MULTI_TERMS.ANALYZE){ + + termStr = analyzeWildcard(field, termStr); + } else if (getNormMultiTerms() == NORM_MULTI_TERMS.LOWERCASE){ + termStr = termStr.toLowerCase(getLocale()); + } + + Term t = new Term(field, termStr); + return newWildcardQuery(t); + } + + + /** + * Builds a new {@link TermRangeQuery} instance. + * Will convert to lowercase if {@link #getLowercaseExpandedTerms()} == true. + * Will analyze terms if {@link #getAnalyzeRangeTerms()} == true. 
+   *
+   *
+   * @param field Field
+   * @param part1 min
+   * @param part2 max
+   * @param startInclusive true if the start of the range is inclusive
+   * @param endInclusive true if the end of the range is inclusive
+   * @return new {@link TermRangeQuery} instance
+   */
+  @Override
+  protected Query newRangeQuery(String field, String part1, String part2,
+      boolean startInclusive, boolean endInclusive) {
+    //TODO: modify newRangeQuery in QueryParserBase to throw ParseException for failure of analysis
+    //need to copy and paste this until we can change analyzeMultiterm(String field, String part) to protected
+    //if we just returned a spanmultitermwrapper around super.newRangeQuery(), analyzeMultiterm would use
+    //the analyzer, but not the multitermAnalyzer
+    String start = null;
+    String end = null;
+
+    if (part1 == null) {
+      start = null;
+    } else {
+      if (getAnalyzeRangeTerms()){
+        try {
+          start = analyzeMultitermTermParseEx(field, part1).utf8ToString();
+        } catch (ParseException e){
+          //swallow
+        }
+      }
+      if ((start == null && getAnalyzeRangeTerms()) ||
+          getNormMultiTerms() == NORM_MULTI_TERMS.LOWERCASE){
+        start = part1.toLowerCase(getLocale());
+      } else {
+        start = part1;
+      }
+    }
+
+    if (part2 == null) {
+      end = null;
+    } else {
+      if (getAnalyzeRangeTerms()){
+        try {
+          //analyze part2 (the upper bound), not part1
+          end = analyzeMultitermTermParseEx(field, part2).utf8ToString();
+        } catch (ParseException e){
+          //swallow
+        }
+      }
+      if ((end == null && getAnalyzeRangeTerms()) ||
+          getLowercaseExpandedTerms()){
+        end = part2.toLowerCase(getLocale());
+      } else {
+        end = part2;
+      }
+    }
+
+    final TermRangeQuery query =
+        TermRangeQuery.newStringRange(field, unescape(start), unescape(end), startInclusive, endInclusive);
+
+    query.setRewriteMethod(getMultiTermRewriteMethod());
+    return new SpanMultiTermQueryWrapper(query);
+  }
+
+
+
+
+  /**
+   * This identifies and then builds the various single term and/or multiterm
+   * queries. This does not identify a regex or range term query!
+   *
+   *

+ * For {@link org.apache.lucene.search.FuzzyQuery}, this defaults to + * {@link org.apache.lucene.search.FuzzyQuery.defaultMaxEdits} + * if no value is specified after the ~. + * * @param field + * @param termText + * @param quoted + * @return SpanQuery or null if termText is a stop word + * @throws ParseException + */ + public Query buildAnySingleTermQuery(String field, String termText, boolean quoted) throws ParseException { + Query q = null; + + // is this a fuzzy term? + Matcher m = FUZZY_PATTERN.matcher(termText); + if (m.find() && ! isCharEscaped(termText, m.start())) { + String term = termText.substring(0, m.start()); + String transposString = m.group(1); + String minSimilarityString = m.group(2); + String decimalComponent = m.group(3); + String prefixLenString = m.group(4); + float minSimilarity = (float) FuzzyQuery.defaultMaxEdits; + if (minSimilarityString != null && minSimilarityString.length() > 0) { + if (decimalComponent == null || decimalComponent.length() == 0) { + decimalComponent = "0"; + } + try { + minSimilarity = Float.parseFloat(minSimilarityString + "." + decimalComponent); + } catch (NumberFormatException e) { + // shouldn't ever happen. If it does, fall back to original value of + // slop + // swallow + } + } + + // if the user enters 2.4 for example, round it so that there won't be + // an + // illegalparameter exception + if (minSimilarity >= 1.0f) { + minSimilarity = (float) Math.round(minSimilarity); + } + + int prefixLen = getFuzzyPrefixLength(); + if (prefixLenString != null){ + try{ + prefixLen = Integer.parseInt(prefixLenString); + } catch (NumberFormatException e){ + //swallow + } + } + boolean transpositions = (transposString != null) ? false : true; + + q = getFuzzyQuery(field, term, minSimilarity, prefixLen, transpositions); + return q; + } + + // is this a wildcard term? + m = WILDCARD_PATTERN.matcher(termText); + Set ws = new HashSet(); + while (m.find()) { + if (! 
isCharEscaped(termText, m.start())){ + ws.add(m.start()); + } + } + if (ws.size() > 0) { + + if (ws.size() == 1 // there's only one wildcard character + && ws.contains(termText.length() - 1) // it isn't escaped + && termText.indexOf("*") == termText.length() - 1 // it is * not ? + && termText.length() > 1) { //it isn't just * by itself + // snip final * + q = getPrefixQuery(field, + termText.substring(0, termText.length() - 1)); + } else { + q = getWildcardQuery(field, termText); + } + } + // if you've found anything, return it + if (q != null) { + return q; + } + // treat as basic single term query + + return getFieldQuery(field, termText, quoted); + } + + + @Override + protected Query newTermQuery(Term t){ + t = unescape(t); + return new SpanTermQuery(t); + } + + @Override + protected Query getFieldQuery(String field, String termText, boolean quoted) + throws ParseException { + return newFieldQuery(getWholeTermAnalyzer(field), field, termText, quoted); + } + + @Override + protected Query getFieldQuery(String field, String queryText, int slop) + throws ParseException { + Query query = getFieldQuery(field, queryText, true); + + if (query instanceof SpanNearQuery) { + if (((SpanNearQuery)query).getSlop() != slop){ + slop = (slop > spanNearMaxDistance) ? spanNearMaxDistance : slop; + SpanQuery[] clauses = ((SpanNearQuery) query).getClauses(); + query = new SpanNearQuery(clauses, slop, true); + } + } + + return query; + } + /** + * Build what appears to be a simple single term query. If the analyzer breaks + * it into multiple terms, treat that as a "phrase" or as an "or" depending on + * the value of {@link #autoGeneratePhraseQueries}. + * + * Can return null! 
+ * @param field + * @param termText + * @return query + * @throws ParseException + */ + @Override + protected Query newFieldQuery(Analyzer analyzer, String field, String termText, boolean quoted) + throws ParseException { + //largely plagiarized from QueryParserBase + TokenStream source; + try { + source = analyzer.tokenStream(field, termText); + source.reset(); + } catch (IOException e) { + ParseException p = new ParseException("Unable to initialize TokenStream to analyze query text"); + p.initCause(e); + throw p; + } + CachingTokenFilter buffer = new CachingTokenFilter(source); + TermToBytesRefAttribute termAtt = null; + PositionIncrementAttribute posIncrAtt = null; + OffsetAttribute offsetAtt = null; + int numTokens = 0; + + buffer.reset(); + + if (buffer.hasAttribute(TermToBytesRefAttribute.class)) { + termAtt = buffer.getAttribute(TermToBytesRefAttribute.class); + } + if (buffer.hasAttribute(PositionIncrementAttribute.class)) { + posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class); + } + if (buffer.hasAttribute(OffsetAttribute.class)){ + offsetAtt = buffer.getAttribute(OffsetAttribute.class); + } + + boolean hasMoreTokens = false; + if (termAtt != null) { + try { + hasMoreTokens = buffer.incrementToken(); + while (hasMoreTokens) { + numTokens++; + hasMoreTokens = buffer.incrementToken(); + } + } catch (IOException e) { + // ignore + } + } + try { + // rewind the buffer stream + buffer.reset(); + //source.end(); + // close original stream - all tokens buffered + source.close(); + } + catch (IOException e) { + ParseException p = new ParseException("Cannot close TokenStream analyzing query text"); + p.initCause(e); + throw p; + } + + BytesRef bytes = termAtt == null ? 
null : termAtt.getBytesRef(); + + if (numTokens == 0){ + if (throwExceptionForEmptyTerm){ + throw new ParseException("Couldn't find any content term in: "+ termText); + } + return null; + } else if (numTokens == 1) { + try { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + termAtt.fillBytesRef(); + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))); + } else { + + List queries = new ArrayList(); + try{ + if (posIncrAtt != null){ + analyzeComplexSingleTerm(field, buffer, termAtt, bytes, posIncrAtt, queries); + } else if (offsetAtt != null){ + analyzeComplexSingleTerm(field, buffer, termAtt, bytes, offsetAtt, queries); + } else { + while (buffer.incrementToken()) { + termAtt.fillBytesRef(); + queries.add((SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)))); + } + } + } catch (IOException e){ + //ignore + } + List nonEmpties = new LinkedList(); + for (SpanQuery piece : queries) { + if (piece != null) { + nonEmpties.add(piece); + } else if (piece == null && throwExceptionForEmptyTerm) { + throw new ParseException("Stop word found in " + termText); + } + } + + if (nonEmpties.size() == 0) { + return getEmptySpanQuery(); + } + if (nonEmpties.size() == 1) { + return nonEmpties.get(0); + } + SpanQuery[] ret = nonEmpties + .toArray(new SpanQuery[nonEmpties.size()]); + if (quoted || getAutoGeneratePhraseQueries() == true) { + return new SpanNearQuery(ret, 0, true); + } else { + return new SpanOrQuery(ret); + } + } + } + + + + private void analyzeComplexSingleTerm(String field, + CachingTokenFilter ts, TermToBytesRefAttribute termAtt, BytesRef bytes, + OffsetAttribute offAtt, + List queries) throws IOException { + int lastStart = -1; + while (ts.incrementToken()) { + termAtt.fillBytesRef(); + //if start is the same, treat it as a synonym...ignore end because + //of potential for shingles + if (lastStart > -1 && 
offAtt.startOffset() == lastStart) + //&& offAttr.endOffset() == lastEnd) + { + + handleSyn(queries, (SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)))); + } else { + + queries.add((SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)))); + } + lastStart = offAtt.startOffset(); + } + + } + + private void analyzeComplexSingleTerm(String field, + CachingTokenFilter ts, TermToBytesRefAttribute termAtt, BytesRef bytes, + PositionIncrementAttribute posAtt, + List queries) throws IOException{ + while (ts.incrementToken()) { + termAtt.fillBytesRef(); + if (posAtt.getPositionIncrement() == 0){ + handleSyn(queries, (SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)))); + } else { + //add null for stop words + for (int i = 1; i < posAtt.getPositionIncrement(); i++) { + queries.add(null); + } + queries.add((SpanTermQuery)newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)))); + } + } + + } + + private void handleSyn(List queries, SpanQuery currQuery) { + assert(queries != null); + //grab the last query added to queries + SpanQuery last = null; + boolean removed = false; + if (queries.size() > 0){ + last = queries.remove(queries.size()-1); + removed = true; + } + //if it exists and does not equal null + if (last != null){ + if (last instanceof SpanOrQuery){ + ((SpanOrQuery)last).addClause(currQuery); + } else { + SpanQuery tmp = last; + last = new SpanOrQuery(); + ((SpanOrQuery)last).addClause(tmp); + ((SpanOrQuery)last).addClause(currQuery); + } + queries.add(last); + } else { + //if you actually removed a null, put it back on + if (removed){ + queries.add(null); + } + //then add the new term + queries.add(currQuery); + } + } + + /** + * + * @param clauses + * @return {@link org.apache.lucene.search.spans.SpanOrQuery} might be empty if clauses is null or contains + * only empty queries + */ + protected SpanQuery buildSpanOrQuery(List clauses) + throws ParseException{ + if (clauses == null || clauses.size() == 0) + 
return getEmptySpanQuery(); + + List nonEmpties = removeEmpties(clauses); + if (nonEmpties.size() == 0) { + return getEmptySpanQuery(); + } + if (nonEmpties.size() == 1) + return nonEmpties.get(0); + + SpanQuery[] arr = nonEmpties.toArray(new SpanQuery[nonEmpties.size()]); + return new SpanOrQuery(arr); + + } + + + protected SpanQuery buildSpanNearQuery(List clauses, int slop, + Boolean inOrder) throws ParseException { + if (clauses == null || clauses.size() == 0) + return getEmptySpanQuery(); + + List nonEmpties = removeEmpties(clauses); + + if (nonEmpties.size() == 0) { + return getEmptySpanQuery(); + } + if (nonEmpties.size() == 1){ + SpanQuery child = nonEmpties.get(0); + //if single child is itself a SpanNearQuery, inherit slop and inorder + if (child instanceof SpanNearQuery){ + SpanQuery[] childsClauses = ((SpanNearQuery)child).getClauses(); + child = new SpanNearQuery(childsClauses, slop, inOrder); + } + } + + if (slop == UNSPECIFIED_SLOP){ + slop = getPhraseSlop(); + } else if (slop > spanNearMaxDistance) { + slop = spanNearMaxDistance; + } + + boolean localInOrder = DEFAULT_IN_ORDER; + if (inOrder != UNSPECIFIED_IN_ORDER){ + localInOrder = inOrder.booleanValue(); + } + + SpanQuery[] arr = nonEmpties.toArray(new SpanQuery[nonEmpties.size()]); + return new SpanNearQuery(arr, slop, localInOrder); + } + + /** + * This is meant to "fix" two cases that might be surprising to a + * non-whitespace language speaker. If a user entered, e.g. "\u5927\u5B66"~3, + * and {@link #autoGeneratePhraseQueries} is set to true, then the parser + * would treat this recursively and yield [[\u5927\u5B66]]~3 by default. The user + * probably meant: find those two characters within three words of each other, + * not find those right next to each other and that hit has to be within three + * words of nothing. 
+ * + * If a user entered the same thing and {@link #autoGeneratePhraseQueries} is + * set to false, then the parser would treat this as [(\u5927\u5B66)]~3: find + * one character or the other and then that hit has to be within three words + * of nothing...not the desired outcome * @param field + * + * + * @param termText this is the sole child of a SpanNearQuery as identified by a whitespace-based tokenizer + * @param ancestralSlop + * @param ancestralInOrder + * @return query + * @throws ParseException + */ + protected Query specialHandlingForSpanNearWithOneComponent(String field, + String termText, + int ancestralSlop, Boolean ancestralInOrder) throws ParseException { + Query q = newFieldQuery(getWholeTermAnalyzer(field), field, termText, true); + if (q instanceof SpanNearQuery){ + SpanQuery[] childsClauses = ((SpanNearQuery)q).getClauses(); + return buildSpanNearQuery(Arrays.asList(childsClauses), ancestralSlop, ancestralInOrder); + } + return q; + } + + /* protected Query specialHandlingForSpanNearWithOneComponent(String field, + String termText, int mySlop, Boolean myInOrder, + int ancestralSlop, Boolean ancestralInOrder) throws ParseException { + Query q = newFieldQuery(getAnalyzer(), field, termText, false); + if (q instanceof SpanNearQuery){ + SpanQuery[] childsClauses = ((SpanNearQuery)q).getClauses(); + if (mySlop == UNSPECIFIED_SLOP && myInOrder == UNSPECIFIED_IN_ORDER){ + return buildSpanNearQuery(Arrays.asList(childsClauses), ancestralSlop, ancestralInOrder); + } else { + return buildSpanNearQuery(Arrays.asList(childsClauses), mySlop, myInOrder); + } + } + return q; + } + */ + /** + * + * @param clauses + * @param pre + * @param post + * @return span not query + * @throws ParseException + */ + protected SpanQuery buildSpanNotNearQuery(List clauses, int pre, + int post) throws ParseException { + if (clauses.size() != 2) { + throw new ParseException( + String.format("SpanNotNear query must have two clauses. 
I count %d", + clauses.size())); + } + // if include is an empty query, treat this as just an empty query + if (isEmptyQuery(clauses.get(0))) { + return clauses.get(0); + } + // if exclude is an empty query, return include alone + if (isEmptyQuery(clauses.get(1))) { + return clauses.get(0); + } + + if (pre > spanNotNearMaxDistance) { + pre = spanNotNearMaxDistance; + } + if (post > spanNotNearMaxDistance) { + post = spanNotNearMaxDistance; + } + return new SpanNotQuery(clauses.get(0), clauses.get(1), pre, post); + } + + + private List removeEmpties(List queries) + throws ParseException{ + + List nonEmpties = new ArrayList(); + for (SpanQuery q : queries) { + if (!isEmptyQuery(q)) { + nonEmpties.add(q); + } else if (throwExceptionForEmptyTerm){ + throw new ParseException("Stop word or unparseable term found"); + } + } + return nonEmpties; + } + + public SpanQuery getEmptySpanQuery() { + SpanQuery q = new SpanOrQuery(new SpanTermQuery[0]); + return q; + } + + public boolean isEmptyQuery(Query q) { + if (q == null || + q instanceof SpanOrQuery && ((SpanOrQuery) q).getClauses().length == 0) { + return true; + } + return false; + } + + public static Term unescape(Term t){ + + String txt = t.text(); + try{ + UnescapedCharSequence un = EscapeQuerySyntaxImpl.discardEscapeChar(txt); + + if (! 
un.toString().equals(txt)){ + t = new Term(t.field(),un.toString()); + } + } catch (org.apache.lucene.queryparser.flexible.standard.parser.ParseException e){ + //swallow; + } + + return t; + } + + public static String unescape(String s){ + try{ + UnescapedCharSequence un = EscapeQuerySyntaxImpl.discardEscapeChar(s); + return un.toString(); + } catch (org.apache.lucene.queryparser.flexible.standard.parser.ParseException e){ + //swallow; + } + + return s; + + } + + + public int getSpanNearMaxDistance() { + return spanNearMaxDistance; + } + + public void setSpanNearMaxDistance(int spanNearMaxDistance) { + this.spanNearMaxDistance = spanNearMaxDistance; + } + + public int getSpanNotNearMaxDistance() { + return spanNotNearMaxDistance; + } + + public void setSpanNotNearMaxDistance(int spanNotNearMaxDistance) { + this.spanNotNearMaxDistance = spanNotNearMaxDistance; + } + + /** + * If the a term passes through the analyzer and nothing comes out, + * throw an exception or silently ignore the missing term. This can + * happen with stop words or with other strings that the analyzer + * ignores. + * + *

+ * This is applied only at the full term level. + *

+ * Currently, a parseException is thrown no matter the setting on this + * whenever an analyzer can't return a value for a multiterm query. + * + * @return throw exception if analyzer yields empty term + */ + public boolean getThrowExceptionForEmptyTerm() { + return throwExceptionForEmptyTerm; + } + + /** + * @see #getThrowExceptionForEmptyTerm() + * @param throwExceptionForEmptyTerm + */ + public void setThrowExceptionForEmptyTerm(boolean throwExceptionForEmptyTerm) { + this.throwExceptionForEmptyTerm = throwExceptionForEmptyTerm; + } + + protected static boolean isCharEscaped(String s, int i){ + int j = i; + int esc = 0; + while (--j >=0 && s.charAt(j) == '\\'){ + esc++; + } + if (esc % 2 == 0){ + return false; + } + return true; + } + /** + * Copied nearly exactly from FuzzyQuery's floatToEdits. + *

+ * There are two differences: + *

+ *

    + *
  1. FuzzyQuery's floatToEdits requires that the return value + * be <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE
  2. + *
  3. This adds a small amount so that nearly exact + * hits don't get floored: 0.80 for termLen 5 should = 1
  4. + *
      + * @param minimumSimilarity + * @param termLen + * @return edits + */ + public static int unboundedFloatToEdits(float minimumSimilarity, int termLen) { + if (minimumSimilarity >= 1f) { + return (int)minimumSimilarity; + } else if (minimumSimilarity == 0.0f) { + return 0; // 0 means exact, not infinite # of edits! + } else { + return (int)(0.00001f+(1f-minimumSimilarity) * termLen); + } + } + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanOnlyParser.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanOnlyParser.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanOnlyParser.java (revision 0) @@ -0,0 +1,96 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.queryparser.classic.CharStream; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.spans.tokens.SQPClause; +import org.apache.lucene.queryparser.spans.tokens.SQPOrClause; +import org.apache.lucene.queryparser.spans.tokens.SQPToken; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.util.Version; + +/** + * This is somewhat of a toy class that enables easy testing of the span only + * parsing components. This does not handle boolean operators (AND, NOT, OR, +/-), + * and it does not handle multiple fields. It also doesn't handle MatchAllDocsQueries. + *

      + * However, it does guarantee that a SpanQuery is returned. + *

      + * The functionality of this class was the initial offering in LUCENE-5205. + * + * + * @see SpanQueryParser + */ +public class SpanOnlyParser extends AbstractSpanQueryParser{ + + + private static final int MAX_QUERY_LENGTH_CHARS = 30000; + + + private String topLevelQueryString = ""; + + public SpanOnlyParser(Version matchVersion, String f, Analyzer a){ + init(matchVersion, f, a); + } + + public SpanOnlyParser(Version matchVersion, String f, Analyzer a, Analyzer multitermAnalyzer){ + init(matchVersion, f, a, multitermAnalyzer); + } + + @Override + public Query parse(String s) throws ParseException{ + topLevelQueryString = s; + Query q = TopLevelQuery(getField()); + assert(q == null || q instanceof SpanQuery); + return q; + } + + @Override + public void ReInit(CharStream stream) { + //this is crazy...convert string to char stream then back to string for processing + //the value from extending QueryParserBase was greater than this + //bit of craziness. + try { + int i = 0; + while(i++ < MAX_QUERY_LENGTH_CHARS){ + stream.readChar(); + } + } catch (IOException e) {} + topLevelQueryString = stream.GetImage(); + + } + + @Override + public Query TopLevelQuery(String field) throws ParseException { + + return _parsePureSpan(field, topLevelQueryString); + } + + + protected Query _parsePureSpan(String field, String queryString) throws ParseException{ + SpanQueryLexer lexer = new SpanQueryLexer(); + List tokens = lexer.getTokens(topLevelQueryString); + SQPClause overallClause = new SQPOrClause(0, tokens.size()); + return _parsePureSpanClause(tokens, field, overallClause); + } +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerminal.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerminal.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerminal.java (revision 0) @@ -0,0 +1,7 @@ +package 
org.apache.lucene.queryparser.spans.tokens;

//stub base class to gather SQPTerm, SQPRegexTerm and SQPRangeTerm
//under the same umbrella (note: a class, not an interface)
public class SQPTerminal extends SQPBoostableToken {

}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPBoostableToken.java
===================================================================
package org.apache.lucene.queryparser.spans.tokens;

import org.apache.lucene.queryparser.spans.SpanQueryParserBase;

/**
 * Base class for lexer tokens that can carry a boost.
 */
public abstract class SQPBoostableToken implements SQPToken {

  private float boost = SpanQueryParserBase.UNSPECIFIED_BOOST;

  public void setBoost(float boost) {
    this.boost = boost;
  }

  public float getBoost() {
    return boost;
  }

  @Override
  public int hashCode() {
    // 31 * 1 + bits, same value as the IDE-generated original
    return 31 + Float.floatToIntBits(boost);
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    // instanceof is false for null, so no separate null check is needed
    if (!(obj instanceof SQPBoostableToken)) {
      return false;
    }
    SQPBoostableToken other = (SQPBoostableToken) obj;
    return Float.floatToIntBits(boost) == Float.floatToIntBits(other.boost);
  }

  @Override
  public String toString() {
    return "SQPBoostableToken [boost=" + boost + "]";
  }
}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPClause.java
===================================================================
lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPClause.java (revision 0) @@ -0,0 +1,71 @@ +package org.apache.lucene.queryparser.spans.tokens; + +public abstract class SQPClause extends SQPBoostableToken { + + public static enum TYPE { PAREN, BRACKET, QUOTE}; + private final int tokenOffsetStart; + private int tokenOffsetEnd; + + public SQPClause(int tokenOffsetStart){ + this.tokenOffsetStart = tokenOffsetStart; + } + + public SQPClause(int tokenOffsetStart, int tokenOffsetEnd){ + this(tokenOffsetStart); + this.tokenOffsetEnd = tokenOffsetEnd; + } + public int getTokenOffsetStart(){ + return tokenOffsetStart; + } + + public int getTokenOffsetEnd(){ + return tokenOffsetEnd; + } + + public void setTokenOffsetEnd(int tokenOffsetEnd){ + this.tokenOffsetEnd = tokenOffsetEnd; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + tokenOffsetStart; + result = prime * result + tokenOffsetEnd; + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof SQPClause)) { + return false; + } + SQPClause other = (SQPClause) obj; + if (tokenOffsetStart != other.tokenOffsetStart) { + return false; + } + if (tokenOffsetEnd != other.tokenOffsetEnd) { + return false; + } + return true; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("SQPClauseBase [charOffsetStart="); + builder.append(tokenOffsetStart); + builder.append(", tokenOffsetEnd="); + builder.append(tokenOffsetEnd); + builder.append("]"); + return builder.toString(); + } + + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOrClause.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOrClause.java (revision 0) +++ 
package org.apache.lucene.queryparser.spans.tokens;

/**
 * Clause representing an OR ("match one of") grouping: (a b c).
 */
public class SQPOrClause extends SQPClause {

  public static final int DEFAULT_MINIMUM_NUMBER_SHOULD_MATCH = 1;

  private int minimumNumberShouldMatch = DEFAULT_MINIMUM_NUMBER_SHOULD_MATCH;

  public SQPOrClause(int tokenOffsetStart, int tokenOffsetEnd) {
    super(tokenOffsetStart, tokenOffsetEnd);
  }

  public int getMinimumNumberShouldMatch() {
    return minimumNumberShouldMatch;
  }

  public void setMinimumNumberShouldMatch(int n) {
    minimumNumberShouldMatch = n;
  }

  public TYPE getType() {
    return TYPE.PAREN;
  }

  // NOTE(review): unlike SQPNearClause, equals/hashCode here do NOT fold in
  // super (the token offsets) -- confirm whether that asymmetry is intended.
  @Override
  public int hashCode() {
    // 31 * 1 + value, same result as the IDE-generated original
    return 31 + minimumNumberShouldMatch;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof SQPOrClause)) {
      return false;
    }
    return minimumNumberShouldMatch == ((SQPOrClause) obj).minimumNumberShouldMatch;
  }

  @Override
  public String toString() {
    return "SQPOrClause [minimumNumberShouldMatch=" + minimumNumberShouldMatch + "]";
  }
}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPBooleanOpToken.java
===================================================================
package org.apache.lucene.queryparser.spans.tokens;

import org.apache.lucene.queryparser.spans.SpanQueryParserBase;

public class SQPBooleanOpToken implements SQPToken {

  private final int type;

  public SQPBooleanOpToken(int type) {
    this.type = type;
  }

  public int getType() {
    return type;
  }

  /** @return true if this token is a conjunction (AND or OR) */
  public boolean isConj() {
    return type == SpanQueryParserBase.CONJ_AND
        || type == SpanQueryParserBase.CONJ_OR;
  }

  @Override
  public int hashCode() {
    // 31 * 1 + type, same value as the IDE-generated original
    return 31 + type;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof SQPBooleanOpToken)) {
      return false;
    }
    return type == ((SQPBooleanOpToken) obj).type;
  }

  @Override
  public String toString() {
    return "SQPBooleanOpToken [type=" + type + "]";
  }

  /** @return true if i is a modifier, i.e. anything other than AND/OR */
  public static boolean isMod(int i) {
    return i != SpanQueryParserBase.CONJ_AND
        && i != SpanQueryParserBase.CONJ_OR;
  }
}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPToken.java
===================================================================
package org.apache.lucene.queryparser.spans.tokens;

//stub interface: common supertype for all lexer tokens
public interface SQPToken {

}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNearClause.java
===================================================================
package org.apache.lucene.queryparser.spans.tokens;

/**
 * Clause representing a "near" (proximity) grouping, optionally with
 * explicit slop and ordering parameters.
 */
public class SQPNearClause extends SQPClause {

  public static final Boolean UNSPECIFIED_IN_ORDER = null;

  private final TYPE type;
  private final Boolean inOrder;
  private final boolean hasParams;
  private final int slop;

  public SQPNearClause(int tokenStartOffset, int tokenEndOffset, TYPE type,
      boolean hasParams, Boolean inOrder, int slop) {
    super(tokenStartOffset, tokenEndOffset);
    this.type = type;
    this.hasParams = hasParams;
    this.inOrder = inOrder;
    this.slop = slop;
  }

  public TYPE getType() {
    return type;
  }

  public Boolean getInOrder() {
    return inOrder;
  }

  public boolean hasParams() {
    return hasParams;
  }

  public int getSlop() {
    return slop;
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = super.hashCode();
    result = prime * result + (hasParams ? 1231 : 1237);
    result = prime * result + ((inOrder == null) ? 0 : inOrder.hashCode());
    result = prime * result + slop;
    result = prime * result + ((type == null) ? 0 : type.hashCode());
    return result;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj) || !(obj instanceof SQPNearClause)) {
      return false;
    }
    SQPNearClause other = (SQPNearClause) obj;
    if (hasParams != other.hasParams || slop != other.slop || type != other.type) {
      return false;
    }
    return inOrder == null ? other.inOrder == null : inOrder.equals(other.inOrder);
  }

  @Override
  public String toString() {
    return "SQPNearClause [type=" + type + ", inOrder=" + inOrder
        + ", hasParams=" + hasParams + ", slop=" + slop + "]";
  }
}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPField.java
===================================================================
package org.apache.lucene.queryparser.spans.tokens;

/** Token representing a field name ("field:"). */
public class SQPField implements SQPToken {

  private final String field;

  public SQPField(String field) {
    this.field = field;
  }

  public String getField() {
    return field;
  }

  @Override
  public int hashCode() {
    // 31 * 1 + hash, same value as the IDE-generated original
    return 31 + ((field == null) ? 0 : field.hashCode());
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof SQPField)) {
      return false;
    }
    SQPField other = (SQPField) obj;
    return field == null ? other.field == null : field.equals(other.field);
  }

  @Override
  public String toString() {
    return "SQPField [field=" + field + "]";
  }
}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRangeTerm.java
===================================================================
package org.apache.lucene.queryparser.spans.tokens;

/** Terminal representing a range term with inclusive/exclusive endpoints. */
public class SQPRangeTerm extends SQPTerminal {

  private final String start;
  private final String end;
  private final boolean startInclusive;
  private final boolean endInclusive;

  public SQPRangeTerm(String from, String to, boolean startInclusive, boolean endInclusive) {
    this.start = from;
    this.end = to;
    this.startInclusive = startInclusive;
    this.endInclusive = endInclusive;
  }

  public String getStart() {
    return start;
  }

  public String getEnd() {
    return end;
  }

  public boolean getStartInclusive() {
    return startInclusive;
  }

  public boolean getEndInclusive() {
    return endInclusive;
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + ((end == null) ? 0 : end.hashCode());
    result = prime * result + (endInclusive ? 1231 : 1237);
    result = prime * result + ((start == null) ? 0 : start.hashCode());
    result = prime * result + (startInclusive ? 1231 : 1237);
    return result;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof SQPRangeTerm)) {
      return false;
    }
    SQPRangeTerm other = (SQPRangeTerm) obj;
    if (startInclusive != other.startInclusive || endInclusive != other.endInclusive) {
      return false;
    }
    if (start == null ? other.start != null : !start.equals(other.start)) {
      return false;
    }
    return end == null ? other.end == null : end.equals(other.end);
  }

  @Override
  public String toString() {
    return "SQPRangeTerm [start=" + start + ", end=" + end
        + ", startInclusive=" + startInclusive
        + ", endInclusive=" + endInclusive + "]";
  }
}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/package.html
===================================================================
<html>
<body>

      +Classes primarily used by SpanQueryLexer. +

      + + + Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOpenClause.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOpenClause.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPOpenClause.java (revision 0) @@ -0,0 +1,53 @@ +package org.apache.lucene.queryparser.spans.tokens; + +public class SQPOpenClause extends SQPClause { + + + private final TYPE type; + + public SQPOpenClause(int startOffset, TYPE type){ + super(startOffset); + this.type = type; + } + + public TYPE getType() { + return type; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + ((type == null) ? 0 : type.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!super.equals(obj)) { + return false; + } + if (!(obj instanceof SQPOpenClause)) { + return false; + } + SQPOpenClause other = (SQPOpenClause) obj; + if (type != other.type) { + return false; + } + return true; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("SQPOpenClause [type="); + builder.append(type); + builder.append("]"); + return builder.toString(); + } + + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPMatchAllDocsToken.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPMatchAllDocsToken.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPMatchAllDocsToken.java (revision 0) @@ -0,0 +1,5 @@ +package org.apache.lucene.queryparser.spans.tokens; + +public class SQPMatchAllDocsToken extends SQPTerminal{ + +} Index: 
lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRegexTerm.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRegexTerm.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPRegexTerm.java (revision 0) @@ -0,0 +1,52 @@ +package org.apache.lucene.queryparser.spans.tokens; + +public class SQPRegexTerm extends SQPTerminal{ + private String term; + public SQPRegexTerm(String t){ + this.term = t; + } + + public String getString(){ + return term; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof SQPRegexTerm)) { + return false; + } + SQPRegexTerm other = (SQPRegexTerm) obj; + if (term == null) { + if (other.term != null) { + return false; + } + } else if (!term.equals(other.term)) { + return false; + } + return true; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("SQPRegexTerm [term="); + builder.append(term); + builder.append("]"); + return builder.toString(); + } + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNotNearClause.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNotNearClause.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPNotNearClause.java (revision 0) @@ -0,0 +1,82 @@ +package org.apache.lucene.queryparser.spans.tokens; + +public class SQPNotNearClause extends SQPClause{ + + public static final int NOT_DEFAULT = 0; + + private final TYPE type; + + private final int 
notPre; + private final int notPost; + + public SQPNotNearClause(int tokenStartOffset, int tokenEndOffset, TYPE type, + int notPre, int notPost){ + super(tokenStartOffset, tokenEndOffset); + this.type = type; + this.notPre = notPre; + this.notPost = notPost; + } + + public TYPE getType() { + return type; + } + + public int getNotPre() { + return notPre; + } + + public int getNotPost() { + return notPost; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + notPost; + result = prime * result + notPre; + result = prime * result + ((type == null) ? 0 : type.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!super.equals(obj)) { + return false; + } + if (!(obj instanceof SQPNotNearClause)) { + return false; + } + SQPNotNearClause other = (SQPNotNearClause) obj; + if (notPost != other.notPost) { + return false; + } + if (notPre != other.notPre) { + return false; + } + if (type != other.type) { + return false; + } + return true; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("SQPNotNearClause [type="); + builder.append(type); + builder.append(", notPre="); + builder.append(notPre); + builder.append(", notPost="); + builder.append(notPost); + builder.append("]"); + builder.append( getTokenOffsetStart() + ": " + getTokenOffsetEnd()); + return builder.toString(); + } + + + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerm.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerm.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/tokens/SQPTerm.java (revision 0) @@ -0,0 +1,69 @@ +package org.apache.lucene.queryparser.spans.tokens; + +public class SQPTerm extends SQPTerminal{ + private 
final String string; + private boolean isQuoted = false; + + + public SQPTerm(String string){ + this.string = string; + } + + public String getString(){ + return string; + } + + public void setIsQuoted(boolean b){ + isQuoted = b; + } + public boolean isQuoted(){ + return isQuoted; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + (isQuoted ? 1231 : 1237); + result = prime * result + ((string == null) ? 0 : string.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof SQPTerm)) { + return false; + } + SQPTerm other = (SQPTerm) obj; + if (isQuoted != other.isQuoted) { + return false; + } + if (string == null) { + if (other.string != null) { + return false; + } + } else if (!string.equals(other.string)) { + return false; + } + return true; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("SQPTerm [string="); + builder.append(string); + builder.append(", isQuoted="); + builder.append(isQuoted); + builder.append("]"); + return builder.toString(); + } + + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java (revision 0) @@ -0,0 +1,444 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.queryparser.classic.CharStream; +import org.apache.lucene.queryparser.classic.ParseException; + +import org.apache.lucene.queryparser.spans.tokens.SQPBooleanOpToken; +import org.apache.lucene.queryparser.spans.tokens.SQPBoostableToken; +import org.apache.lucene.queryparser.spans.tokens.SQPClause; +import org.apache.lucene.queryparser.spans.tokens.SQPField; +import org.apache.lucene.queryparser.spans.tokens.SQPMatchAllDocsToken; +import org.apache.lucene.queryparser.spans.tokens.SQPNearClause; +import org.apache.lucene.queryparser.spans.tokens.SQPOrClause; +import org.apache.lucene.queryparser.spans.tokens.SQPTerminal; +import org.apache.lucene.queryparser.spans.tokens.SQPToken; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.util.Version; + +/** + *

      This parser leverages the power of SpanQueries and can combine them with + * traditional boolean logic and multiple field information. This parser includes functionality from: + *

        + *
      • {@link org.apache.lucene.queryparser.classic.QueryParser classic QueryParser}: most of its syntax
      • + *
      • {@link org.apache.lucene.queryparser.surround.parser.QueryParser SurroundQueryParser}: recursive parsing for "near" and "not" clauses.
      • + *
      • {@link org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser}: + * can handle "near" queries that include multiterms ({@link org.apache.lucene.search.WildcardQuery}, + * {@link org.apache.lucene.search.FuzzyQuery}, {@link org.apache.lucene.search.RegexpQuery}).
      • + *
      • {@link org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser}: has an option to analyze multiterms.
      • + *
      + * + *

      + * + *

      + * Background + * This parser was developed for the concordance/analytic search use case -- + * the user wants to see every time a span occurs. The basic approach of this parser is to build + * BooleanQueries comprised of SpanQueries. Aside from a MatchAllDocsQuery, there should be no other + * types of queries. + * + *

      + * + *

      With luck, this parser will be made obsolete by LUCENE-2878, but until then, + * this parser fills a niche. + *

      + *

      + * One goal was to keep the syntax as close to Lucene's classic + * {@link org.apache.lucene.queryparser.classic.QueryParser} as possible. + * Another goal was to make analysis of multiterms a fundamental part of the parser + * {@link AnalyzingQueryParserBase}. + *

      + *

      Similarities and Differences

      + * + *

      Same as classic syntax: + *

        + *
      • term: test
      • + *
      • fuzzy: roam~0.8, roam~2
      • + *
      • wildcard: te?t, test*, t*st
      • + *
      • regex: /[mb]oat/
      • + *
      • phrase: "jakarta apache"
      • + *
      • phrase with slop: "jakarta apache"~3
      • + *
      • "or" clauses: jakarta apache
      • + *
      • grouping clauses: (jakarta apache)
      • + *
      • field: author:hatcher title:lucene
      • + *
      • boolean operators: (lucene AND apache) NOT jakarta + *
      • required/not required operators: +lucene +apache -jakarta
      • + *
      • boolean with field:(author:hatcher AND author:gospodnetic) AND title:lucene
      • + *
      + *

      + *

      Main additions in SpanQueryParser syntax vs. classic syntax: + *

        + *
      • Can require "in order" for phrases with slop with the ~> operator: "jakarta apache"~>3
      • + *
      • Can specify "not near" "bieber fever"!~3,10 :: + * find "bieber" but not if "fever" appears within 3 words before or + * 10 words after it.
      • + *
      • Fully recursive phrasal queries with [ and ]; as in: [[jakarta apache]~3 lucene]~>4 :: + * find "jakarta" within 3 words of "apache", and that hit has to be within four + * words before "lucene".
      • + *
      • Can also use [] for single level phrasal queries instead of "" as in: [jakarta apache]
      • + *
      • Can use "or" clauses in phrasal queries: "apache (lucene solr)"~3 :: + * find "apache" and then either "lucene" or "solr" within three words. + *
      • + *
      • Can use multiterms in phrasal queries: "jakarta~1 ap*che"~2
      • + *
      • Did I mention recursion: [[jakarta~1 ap*che]~2 (solr~ /l[ou]+[cs][en]+/)]~10 :: + * Find something like "jakarta" within two words of "ap*che" and that hit + * has to be within ten words of something like "solr" or that lucene regex.
      • + *
      • How about: "fever (travlota~2 disco "saturday night" beeber~1)"!~3,10 :: find fever but not if something like + * travlota or disco or "saturday night" or something like beeber appears within 3 words before or 10 words after.
      • + *
      • Can require at least x number of hits at boolean level: apache AND (lucene solr tika)~2
      • + *
      + *

      + *

      + * Trivial additions: + *

        + *
      • Can specify prefix length in fuzzy queries: jakarta~1,2 (edit distance=1, prefix=2)
      • + *
      • Can specify prefix Optimal String Alignment (OSA) vs Levenshtein + * in fuzzy queries: jakarta~1 (OSA) vs jakarta~>1 (Levenshtein)
      • + *
      + * + * + *

      Limitations of SpanQueryParser compared with classic QueryParser: + *

        + *
      1. There is some learning curve to figure out the subtle differences in syntax between + * when one is within a phrase and when not. Including: + *
          + *
        1. Boolean operators are not allowed within phrases: "solr (apache AND lucene)". + * Consider rewriting:[solr [apache lucene]]
        2. + *
        3. Field information is not allowed within phrases.
        4. + *
        5. Minimum hit counts for boolean or queries are not allowed within phrases: [apache (lucene solr tika)~2]
        6. + *
        + *
      2. This parser is not built with a JavaCC (.jj) grammar or the ANTLR parser framework. + * Regrettably, because it is generating a {@link org.apache.lucene.search.spans.SpanQuery}, + * it can't use all of the generalizable queryparser infrastructure that was added with Lucene 4.+.
      3. + *
      + *

      + *

      Stop word handling + *

      + *

      The user can choose to throw a {@link org.apache.lucene.queryparser.classic.ParseException} if a stop word is encountered. + * If SpanQueryParserBase.throwExceptionForEmptyTerm is set to false (default), the following should happen. + *

      + *

      + *

        + *
      • Term: "the" will return an empty SpanQuery (similar to classic queryparser)
      • + *
      • BooleanOr: (the apache jakarta) will drop the stop word and return a + * {@link org.apache.lucene.search.spans.SpanOrQuery} for "apache" + * or "jakarta" + *
      • SpanNear: "apache and jakarta" will drop the "and" and match on only "apache jakarta"
      • + *

      + *

      A parse exception is currently always thrown if the parser analyzes a multiterm, and a subcomponent of the + * multiterm has a stopword: the*tre + *

      + *

      Expert: Other subtle differences between SpanQueryParser and classic QueryParser. + *

        + *
      • Fuzzy queries with an edit distance > 2 are handled by SlowFuzzyQuery. The developer can set fuzzyMinSim to limit + * the maximum edit distance (i.e. turn off SlowFuzzyQuery by setting fuzzyMinSim = 2.0f).
      • + *
      • Fuzzy queries with edit distance >=1 are rounded so that an exception is not thrown.
      • + *
      + *

      + *

      Truly Expert: there are a few other very subtle differences that are documented in comments + * in the sourcecode in the header of AnalyzingQueryParser. + *

      + *

      + * NOTE You must add the sandbox jar to your class path to include + * the currently deprecated {@link org.apache.lucene.sandbox.queries.SlowFuzzyQuery}. + *

      + * + */ + +public class SpanQueryParser extends AbstractSpanQueryParser { + + /* + * Some subtle differences between classic QueryParser and SpanQueryParser + * + * 1) In a range query, this parser is not removing double quotes. + * [ "abc" TO "xyz" ] -> [abc TO xyz] in classic query parser, but remains the same in SpanQueryParser + * + * 2) The SpanQueryParser does not recognize quotes as a way to escape non-regexes. + * In classic syntax a path string of "/abc/def/ghi" is denoted by the double quotes; in + * SpanQueryParser, the user has to escape the / as in \/abc\/def\/ghi + * + * 3) The SpanQueryParser appears to be adding an escape to RangeTermQueries of *, as in: + * in classic "[ * TO y]" -> [* TO y] + * but in SpanQueryParser: + * "[ * TO y]" -> [\* TO y] + * + * SpanQueryParser's handling of this is the same as creating a new RangeTermQuery. + * However, it does return different docs than the query generated by classic. + * + * 4) "term^3~" is not handled. Boosts must currently come after fuzzy mods in SpanQueryParser. + * + * 5) SpanQueryParser rounds fuzzy sims that are > 1.0. This test fails: assertParseException("term~1.1") + * + * 6) SpanQueryParser adds a small amount to its own floatToEdits calculation + * so that near exact percentages (e.g. 80% of a 5 char word should yield 1) + * aren't floored and therefore miss. + * + * For SpanQueryParser, brwon~0.80 hits on brown; however, it does + * not hit with classic query parser. + * + * Unfortunately, like classic query parser, SpanQueryParser will fail to parse + * a token with an odd number of \ ending in a phrasal boundary (LUCENE-1189). 
+ * + * The test case that is used in LUCENE-1189 is slightly different than the original + * issue: \"(name:[///mike\\\\\\\") or (name:\"alphonse\")"; + * + * + */ + private static final int MAX_QUERY_LENGTH_CHARS = 30000; + + private String topLevelQueryString; + + public SpanQueryParser(Version matchVersion, String f, Analyzer a){ + init(matchVersion, f, a); + } + + public SpanQueryParser(Version matchVersion, String f, Analyzer a, Analyzer multitermAnalyzer){ + init(matchVersion, f, a, multitermAnalyzer); + } + + @Override + public void ReInit(CharStream stream) { + //this is crazy...convert string to char stream then back to string for processing + //the value from extending QueryParserBase was greater than this + //bit of craziness. + try { + int i = 0; + while(i++ < MAX_QUERY_LENGTH_CHARS){ + stream.readChar(); + } + } catch (IOException e) {} + topLevelQueryString = stream.GetImage(); + } + + @Override + public Query TopLevelQuery(String field) throws ParseException { + + return _parse(field); + } + + @Override + public Query parse(String s) throws ParseException { + topLevelQueryString = s; + return TopLevelQuery(getField()); + } + + private Query _parse(String field) throws ParseException { + if (topLevelQueryString == null || topLevelQueryString.equals("")){ + return getEmptySpanQuery(); + } + SpanQueryLexer lexer = new SpanQueryLexer(); + List tokens = lexer.getTokens(topLevelQueryString); + //just for debugging + for (int i = 0; i < tokens.size(); i++){ + SQPToken t = tokens.get(i); + String end = ""; + Float boost = Float.NaN; + if (t instanceof SQPClause){ + end = Integer.toString(((SQPClause)t).getTokenOffsetEnd()); + } + if (t instanceof SQPBoostableToken){ + boost = ((SQPBoostableToken)t).getBoost(); + } + } + SQPClause overallClause = new SQPOrClause(0, tokens.size()); + return parseRecursively(tokens, field, overallClause); + } + + private Query parseRecursively(final List tokens, + String field, SQPClause clause) + throws ParseException{ + int 
start = clause.getTokenOffsetStart(); + int end = clause.getTokenOffsetEnd(); + testStartEnd(tokens, start, end); + List clauses = new ArrayList(); + int conj = CONJ_NONE; + int mods = MOD_NONE; + String tmpField = field; + int i = start; + while (i < end){ + Query q = null; + SQPToken token = tokens.get(i); + + //if boolean operator or field, update local buffers and continue + if (token instanceof SQPBooleanOpToken){ + SQPBooleanOpToken t = (SQPBooleanOpToken)token; + if (t.isConj()){ + conj = t.getType(); + mods = MOD_NONE; + } else { + mods = t.getType(); + } + i++; + continue; + } else if (token instanceof SQPField){ + tmpField = ((SQPField)token).getField(); + i++; + continue; + } + //if or clause, recur through tokens + if (token instanceof SQPOrClause){ + //recur! + SQPOrClause tmpOr = (SQPOrClause)token; + q = parseRecursively(tokens, tmpField, tmpOr); + + if (q instanceof BooleanQuery && tmpOr.getMinimumNumberShouldMatch() > 1){ + ((BooleanQuery)q).setMinimumNumberShouldMatch(tmpOr.getMinimumNumberShouldMatch()); + } + + if (q.getBoost() == 1.0f + && tmpOr.getBoost() != SpanQueryParserBase.UNSPECIFIED_BOOST){ + q.setBoost(tmpOr.getBoost()); + } + i = tmpOr.getTokenOffsetEnd(); + } else if (token instanceof SQPNearClause){ + SQPNearClause tmpNear = (SQPNearClause)token; + q = _parsePureSpanClause(tokens, tmpField, tmpNear); + i = tmpNear.getTokenOffsetEnd(); + + } else if (token instanceof SQPMatchAllDocsToken){ + //order matters SQPMatchAllDocs must be before terminal + q = new MatchAllDocsQuery(); + i++; + } else if (token instanceof SQPTerminal){ + SQPTerminal tmpTerm = (SQPTerminal)token; + q = buildTerminal(tmpField, tmpTerm); + i++; + } + if (! 
isEmptyQuery(q)){ + addClause(clauses, conj, mods, q); + } + //reset mods and conj and field + mods = MOD_NONE; + conj = CONJ_NONE; + tmpField = field; + } + + if (clauses.size() == 0){ + return getEmptySpanQuery(); + } + if (clauses.size() == 1 && + clauses.get(0).getOccur() != Occur.MUST_NOT){ + return clauses.get(0).getQuery(); + } + + BooleanQuery bq = new BooleanQuery(); + try{ + + for (BooleanClause bc : clauses){ + bq.add(bc); + } + } catch (BooleanQuery.TooManyClauses e){ + throw new ParseException(e.getMessage()); + } + + if (clause instanceof SQPOrClause){ + SQPOrClause tmpClause = (SQPOrClause)clause; + if (tmpClause.getMinimumNumberShouldMatch() > SQPOrClause.DEFAULT_MINIMUM_NUMBER_SHOULD_MATCH){ + bq.setMinimumNumberShouldMatch(tmpClause.getMinimumNumberShouldMatch()); + } + } + + return bq; + } + + + private void testStartEnd(List tokens, int start, int end) + throws ParseException { + + SQPToken s = tokens.get(start); + if (s instanceof SQPBooleanOpToken){ + int type = ((SQPBooleanOpToken)s).getType(); + if ( type == CONJ_AND || type == CONJ_OR){ + throw new ParseException("Can't start clause with AND or OR"); + } + } + + SQPToken e = tokens.get(end-1); + + if (e instanceof SQPField){ + throw new ParseException("Can't end clause with a field token"); + } + if (e instanceof SQPBooleanOpToken){ + throw new ParseException("Can't end clause with a boolean operator"); + } + + + } + + + /** + * Extracts the spans from the BooleanQueries that are not in Occur.NOT + * clauses for highlighting. This query should not be used for document retrieval + * and may return different documents than "parse." 
+ * + * @param field + * @param queryString + * @return SpanQuery for highlighting + * @throws ParseException + */ + public SpanQuery getHighlightQuery(String field, String queryString) throws ParseException{ + Query q = parse(queryString); + List sqs = new ArrayList(); + extractSpanQueries(field, q, sqs); + return buildSpanOrQuery(sqs); + } + + /** + * Takes a query generated by this parser and extracts all + * SpanQueries into sqs that are not in a Boolean.Occur.NOT clause + * and that match the given field. + * + * The Query must consist of only BooleanQuery and SpanQuery objects!!! + * @param field + * @param query + * @param sqs + */ + private void extractSpanQueries(String field, Query query, List sqs) { + if (query == null){ + return; + } + if (query instanceof SpanQuery){ + SpanQuery sq = (SpanQuery)query; + if (! isEmptyQuery(sq) && + sq.getField().equals(field)){ + sqs.add((SpanQuery)query); + } + + } else if (query instanceof BooleanQuery){ + BooleanQuery bq = (BooleanQuery)query; + BooleanClause[] clauses = bq.getClauses(); + for (BooleanClause clause : clauses){ + if (clause.getOccur() != Occur.MUST_NOT){ + extractSpanQueries(field, clause.getQuery(), sqs); + } + } + } else { + //ignore + } + } + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryLexer.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryLexer.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryLexer.java (revision 0) @@ -0,0 +1,626 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.util.ArrayList; +import java.util.List; +import java.util.Stack; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.spans.tokens.SQPBooleanOpToken; +import org.apache.lucene.queryparser.spans.tokens.SQPBoostableToken; +import org.apache.lucene.queryparser.spans.tokens.SQPClause; +import org.apache.lucene.queryparser.spans.tokens.SQPClause.TYPE; +import org.apache.lucene.queryparser.spans.tokens.SQPField; +import org.apache.lucene.queryparser.spans.tokens.SQPMatchAllDocsToken; +import org.apache.lucene.queryparser.spans.tokens.SQPNearClause; +import org.apache.lucene.queryparser.spans.tokens.SQPNotNearClause; +import org.apache.lucene.queryparser.spans.tokens.SQPOpenClause; +import org.apache.lucene.queryparser.spans.tokens.SQPOrClause; +import org.apache.lucene.queryparser.spans.tokens.SQPRangeTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPRegexTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPTerminal; +import org.apache.lucene.queryparser.spans.tokens.SQPToken; +import org.apache.lucene.util.mutable.MutableValueInt; + +/** + * Tokenizer that returns a list of tokens of types: + * Term, RegexTerm, RangeTerm + * Boolean AND, NOT, etc + * Field + *

      + * A clause is represented as a node in the list where the clause started. + * The clause includes offsets into the list for where its contents start and end. + *

      + * Unescapes field and boolean operator tokens, but nothing else + * + *

      + * Identifies the following types of exceptions: + * mismatching/unmatched () "" [] + * bad unicode escape sequences + * some illegal conj and mods (and and) + * bad boosts: term^0.6^2 + * + * + * + */ +public class SpanQueryLexer { + + private final static String AND = "AND"; + private final static String NOT = "NOT"; + private final static String OR = "OR";//silently removed from queries...beware! + + private final static int DEFAULT_MIN_REQUIRED_IN_OR = 2; + + // private final static Pattern UNESCAPE = Pattern.compile("\\\\([-+\\p{Z}:\\\\\\(\\)\\[\\]])"); + private final static Pattern UNESCAPE_REGEX = Pattern.compile("\\\\(.)"); + + + private final static String BOOST = "\\^((?:\\d*\\.)?\\d+)"; + private final static Pattern TERM_BOOST = Pattern.compile("^(?:(?:\\\\.)|(?:[^\\\\]))*?(?:"+BOOST+")?$"); + + private final static String OPEN_PAREN = "("; + private final static String OPEN_BRACKET = "["; + private final static String CLOSE_BRACKET = "]"; + private final static String DQUOTE = "\""; + + //Groups + public enum G { + WHOLE, + ESCAPE, + MATCH_ALL_DOCS, + SPACE, + FIELD, + PLUS_MINUS, + RANGE_START, + RANGE_TERM_FROM, + RANGE_TERM_TO, + RANGE_END, + REGEX, + CLOSE_PAREN, + CLOSE_PAREN_DIGITS, + QUOTE_OR_CLOSING_BRACKET, + NEAR_PARAM, + NEAR_IN_ORDER, + NEAR_SLOP, + NOT_NEAR_PRE, + NOT_NEAR_POST, + BOOST, + OPEN_PAREN_OR_BRACKET, + + + }; + + //using \\p{Z} to capture wider variety of Unicode whitespace than \\s + //DO NOT DO THIS!!! Blew heap when a string had a character beyond bmp. 
+ // private final static String TERMINAL_STRING = + //"((?:\\\\.|(?:[-+](?![/\\(\\[\"]))|[^-+\\\\\\(\\)\\[\\]\\p{Z}\"/:\\^])+)(?:(:)|"+BOOST+")?"; + + private final static Pattern ILLEGAL_END = Pattern.compile("^((?:\\\\.)|[^\\\\])*\\\\$"); + private final static Pattern ILLEGAL_UNICODE_ESCAPE = Pattern.compile("\\\\u([0-9a-fA-F]{0,4})");//{0,4})"); + + private final static String ESCAPE_STRING = "((?:\\\\.)+)"; + private final static String SPACE_STRING = "(\\p{Z}+)"; + private final static String MATCH_ALL_DOCS_STRING = "(\\*:\\*)"; + private final static String FIELD_END_STRING = ("(:)"); + private final static String REGEX_STRING = "(?:/((?:\\\\.|[^/\\\\])+?)/)"; + + private final static String RANGE_TERM = "((?:\\\\.|[^\\p{Z}\\(\\)\\[\\]{}])+)"; + private final static String RANGE_START_DELIMITER = "([\\[{])"; + private final static String RANGE_END_DELIMITER = "([\\]}])"; + + private final static String RANGE_STRING = "(?:"+RANGE_START_DELIMITER+ + "\\p{Z}*"+RANGE_TERM+"\\p{Z}+TO\\p{Z}+"+RANGE_TERM+"\\p{Z}*"+RANGE_END_DELIMITER+"(?!(?:~|!~)))"; + //plus/minus must not be followed by a space (to be boolean op) + private final static String PLUS_MINUS_STRING = "(?:([-+])(?!\\p{Z}))"; + private final static String OPENING = "([\\(\\[])"; + + private final static String NEAR_MODIFIERS = "~(?:(>)?(\\d+)?)?"; + private final static String NOT_NEAR_MODIFIERS = "!~(?:(\\d+)(?:,(\\d+))?)?"; + + private final static String NEAR_CLOSING_MODIFIERS = "("+NEAR_MODIFIERS+"|"+NOT_NEAR_MODIFIERS+")?"; + + private final static String OR_CLOSING_MODIFIER = "(?:~(\\d*))?"; + private final static String CLOSING_STRING = "(?:(\\))"+OR_CLOSING_MODIFIER+")|(?:([\\]\"])"+NEAR_CLOSING_MODIFIERS+")"; + + private final static Pattern TOKENIZER = Pattern.compile( + //TERMINAL_STRING +"|"+ + ESCAPE_STRING + "|" + MATCH_ALL_DOCS_STRING + "|" + SPACE_STRING + "|"+ + FIELD_END_STRING+"|"+ + PLUS_MINUS_STRING+ + "|(?:"+RANGE_STRING+"|"+ + REGEX_STRING +"|"+CLOSING_STRING + 
")(?:"+BOOST+")?|"+OPENING); + + public List getTokens(String s) throws ParseException{ + Matcher m = ILLEGAL_END.matcher(s); + if (m.find()){ + throw new ParseException("Can't end query with unescaped backslash character"); + } + m = ILLEGAL_UNICODE_ESCAPE.matcher(s); + while (m.find()){ + if (m.group(1).length() != 4){ + throw new ParseException ("Illegal escaped unicode character: "+m.group(1)); + } + } + List tokens = new ArrayList(); + + Stack stack = new Stack(); + MutableValueInt nearDepth = new MutableValueInt(); + nearDepth.value = 0; + + m = TOKENIZER.matcher(s); + + int last = 0; + while (m.find()){ + + + if (m.group(G.SPACE.ordinal()) != null){ + //space + if (m.start() > last){ + String term = s.substring(last, m.start()); + addRawTerm(term, nearDepth.value, tokens); + } + last = m.end(); + } else if (m.group(G.ESCAPE.ordinal()) != null){ + //don't set last; keep going + } else if (m.group(G.FIELD.ordinal()) != null){ + if (m.start() > 0 && m.start() > last){ + String term = s.substring(last, m.start()); + addField(term, nearDepth.value, tokens); + last = m.end(); + } + + } else if (m.group(G.MATCH_ALL_DOCS.ordinal()) != null){ + tokens.add(new SQPMatchAllDocsToken()); + last = m.end(); + } else { + + if (m.start() > last){ + String term = s.substring(last, m.start()); + addRawTerm(term, nearDepth.value, tokens); + } + addOpTokens(m, tokens, stack, nearDepth); + + last = m.end(); + } + + } + if (last < s.length()){ + String term = s.substring(last); + addRawTerm(term, nearDepth.value, tokens); + } + + if (stack.size() != 0){ + //TODO add more info + throw new ParseException("unmatched bracket"); + } else if (nearDepth.value != 0){ + throw new ParseException("error in nearDepth calc"); + } + + testSingle(tokens); + return tokens; + } + + + private void addOpTokens(Matcher m, + List tokens, Stack stack, MutableValueInt nearDepth) + throws ParseException{ + + //these return early + //perhaps rearrange to more closely align with operator frequency + if 
(m.group(G.CLOSE_PAREN.ordinal()) != null){ + processCloseParen(m, tokens, stack, nearDepth.value); + return; + } else if (m.group(G.QUOTE_OR_CLOSING_BRACKET.ordinal()) != null){ + processCloseBracketOrQuote(m, tokens, stack, nearDepth); + return; + } + + SQPToken token = null; + + if (m.group(G.OPEN_PAREN_OR_BRACKET.ordinal()) != null){ + //open paren or open bracket + if (m.group(G.OPEN_PAREN_OR_BRACKET.ordinal()).equals(OPEN_PAREN)){ + token = new SQPOpenClause(tokens.size(), TYPE.PAREN); + } else if (m.group(G.OPEN_PAREN_OR_BRACKET.ordinal()).equals(OPEN_BRACKET)){ + token = new SQPOpenClause(tokens.size(), TYPE.BRACKET); + nearDepth.value++; + } else { + //should never happen !!! + } + stack.push((SQPOpenClause)token); + } else if (m.group(G.PLUS_MINUS.ordinal()) != null){ + String pm = m.group(G.PLUS_MINUS.ordinal()); + if (pm.equals("+")){ + token = new SQPBooleanOpToken(SpanQueryParserBase.MOD_REQ); + testBooleanTokens(tokens, (SQPBooleanOpToken)token); + } else if (pm.equals("-")){ + token = new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT); + testBooleanTokens(tokens, (SQPBooleanOpToken)token); + } + } else if (m.group(G.REGEX.ordinal()) != null){ + token = new SQPRegexTerm(unescapeRegex(m.group(G.REGEX.ordinal()))); + } else if (m.group(G.RANGE_TERM_FROM.ordinal()) != null){ + boolean startInclusive = false; + boolean endInclusive = false; + if (m.group(G.RANGE_START.ordinal()).equals(OPEN_BRACKET)){ + startInclusive = true; + } + if (m.group(G.RANGE_END.ordinal()).equals(CLOSE_BRACKET)){ + endInclusive = true; + } + + token = new SQPRangeTerm(m.group(G.RANGE_TERM_FROM.ordinal()), m.group(G.RANGE_TERM_TO.ordinal()), + startInclusive, endInclusive); + } + + if (token != null){ + tryToSetBoost(token, m.group(G.BOOST.ordinal())); + tokens.add(token); + } + } + + private void processCloseBracketOrQuote(Matcher m, List tokens, + Stack stack, MutableValueInt nearDepth) throws ParseException { + //open or close quote or closing bracket + //let's start with 
quote + if (m.group(G.QUOTE_OR_CLOSING_BRACKET.ordinal()).equals(DQUOTE)){ + processDQuote(m, tokens, stack, nearDepth); + return; + } + //from here on out, must be close bracket + //test for mismatched + if (stack.isEmpty()){ + throw new ParseException("Couldn't find matching open bracket/quote."); + } + + SQPOpenClause open = stack.pop(); + if (open.getType() != TYPE.BRACKET){ + //TODO: improve error message + throw new ParseException("Was expecting matching bracket!"); + } + + SQPClause clause = buildNearOrNotNear(m, tokens, open); + + tryToSetBoost(open, m.group(G.BOOST.ordinal())); + nearDepth.value--; + tokens.set(open.getTokenOffsetStart(), clause); + } + + + private void processDQuote(Matcher m, List tokens, + Stack stack, MutableValueInt nearDepth) throws ParseException{ + //If a double-quote, don't know if open or closing yet + //first test to see if there's a matching open quote on the stack + //if there is, this must be a closing quote + //if there isn't, push whatever was on the stack back and + //treat this as an opening quote + if (stack.size() > 0){ + SQPOpenClause openCand = stack.pop(); + if (openCand.getType() == TYPE.QUOTE){ + processDQuoteClose(m, tokens, openCand, nearDepth); + return; + + } + //put candidate back on the stack + stack.push(openCand); + } + //by this point, we know that this double quote must be an opener + SQPOpenClause token = new SQPOpenClause(tokens.size(), TYPE.QUOTE); + + stack.push(token); + nearDepth.value++; + tokens.add(token); + } + + private void processDQuoteClose(Matcher m, List tokens, + SQPOpenClause open, MutableValueInt nearDepth) throws ParseException{ + SQPClause clause = buildNearOrNotNear(m, tokens, open); + //special handling if a single term between double quotes + //and the double quotes don't have any parameters + if (clause instanceof SQPNearClause && + ! 
((SQPNearClause)clause).hasParams() && + open.getTokenOffsetStart() == tokens.size()-2 && + tokens.size()-2 >=0){ + boolean abort = false; + SQPToken content = tokens.get(tokens.size()-1); + if (content instanceof SQPRegexTerm){ + //add back in the original / and / + content = new SQPTerm(escapeDQuote("/"+((SQPRegexTerm)content).getString())+"/"); + } else if (content instanceof SQPTerm){ + content = new SQPTerm(escapeDQuote(((SQPTerm)content).getString())); + } else { + abort = true; + } + if (abort == false){ + //remove the last content token + tokens.remove(tokens.size()-1); + //remove the opening clause marker + tokens.remove(tokens.size()-1); + tokens.add(content); + if (clause.getBoost() != SpanQueryParserBase.UNSPECIFIED_BOOST && + ((SQPBoostableToken)content).getBoost() == SpanQueryParserBase.UNSPECIFIED_BOOST){ + ((SQPBoostableToken)content).setBoost(clause.getBoost()); + } + ((SQPTerm)content).setIsQuoted(true); + nearDepth.value--; + return; + } + } + + nearDepth.value--; + tokens.set(open.getTokenOffsetStart(), clause); + } + + + private SQPClause buildNearOrNotNear(Matcher m, List tokens, SQPOpenClause open) + throws ParseException{ + //try for not near first, return early + if (m.group(G.NEAR_PARAM.ordinal()) != null && m.group(G.NEAR_PARAM.ordinal()).startsWith("!")){ + int notPre = SQPNotNearClause.NOT_DEFAULT; + int notPost = SQPNotNearClause.NOT_DEFAULT; + if (m.group(G.NOT_NEAR_PRE.ordinal()) != null){ + notPre = Integer.parseInt(m.group(G.NOT_NEAR_PRE.ordinal())); + notPost = notPre; + } + if (m.group(G.NOT_NEAR_POST.ordinal()) != null){ + notPost = Integer.parseInt(m.group(G.NOT_NEAR_POST.ordinal())); + } + //contents of this clause start at 1 after tokenOffsetStart + SQPNotNearClause clause = new SQPNotNearClause(open.getTokenOffsetStart()+1, tokens.size(), + open.getType(), notPre, notPost); + tryToSetBoost((SQPBoostableToken)clause, m.group(G.BOOST.ordinal())); + return clause; + } + + //must be near + //if nothing is specified, inOrder == 
true + Boolean inOrder = SQPNearClause.UNSPECIFIED_IN_ORDER; + int slop = AbstractSpanQueryParser.UNSPECIFIED_SLOP; + boolean hasParams = false; + if (m.group(G.NEAR_PARAM.ordinal()) != null){ + hasParams = true; + inOrder = new Boolean(false); + } + + if (m.group(G.NEAR_SLOP.ordinal()) != null){ + slop = Integer.parseInt(m.group(G.NEAR_SLOP.ordinal())); + } + + if (m.group(G.NEAR_IN_ORDER.ordinal()) != null){ + inOrder = new Boolean(true); + } + SQPNearClause clause = new SQPNearClause(open.getTokenOffsetStart()+1, tokens.size(), + open.getType(), hasParams, inOrder, slop); + tryToSetBoost((SQPBoostableToken)clause, m.group(G.BOOST.ordinal())); + return clause; + } + + private void tryToSetBoost(SQPToken open, String boostString) throws ParseException{ + if (boostString == null || boostString.length() == 0){ + return; + } + + if (open instanceof SQPBoostableToken){ + try{ + float b = Float.parseFloat(boostString); + ((SQPBoostableToken)open).setBoost(b); + } catch (NumberFormatException e){ + //if the regex works properly, this shoudl never happen + throw new ParseException("Unable to parse number in boost: " + boostString); + } + } + } + + + + private void processCloseParen(Matcher m, List tokens, + Stack stack, int nearDepth) throws ParseException { + if (stack.isEmpty()){ + throw new ParseException("Mismatched closing paren"); + } + SQPOpenClause openCand = stack.pop(); + if (openCand.getType() == TYPE.PAREN){ + SQPOrClause clause = new SQPOrClause(openCand.getTokenOffsetStart()+1, + tokens.size()); + if (m.group(G.CLOSE_PAREN_DIGITS.ordinal()) != null){ + throwIfNear(nearDepth, + "Can't specify minimum number of terms for an 'or' clause within a 'near' clause"); + + if (m.group(G.CLOSE_PAREN_DIGITS.ordinal()).length() > 0){ + clause.setMinimumNumberShouldMatch(Integer.parseInt(m.group(G.CLOSE_PAREN_DIGITS.ordinal()))); + } else { + clause.setMinimumNumberShouldMatch(DEFAULT_MIN_REQUIRED_IN_OR); + } + } + tryToSetBoost(clause, m.group(G.BOOST.ordinal())); + 
tokens.set(openCand.getTokenOffsetStart(), clause); + return; + } + throw new ParseException("Was expecting \")\" but found " + openCand.getType()); + } + + private void throwIfNear(int nearDepth, String string) throws ParseException{ + if (nearDepth != 0){ + throw new ParseException(string); + } + } + + private void addField(String term, int nearDepth, List tokens) throws ParseException{ + if (nearDepth != 0){ + throw new ParseException("Can't specify a field within a \"Near\" clause"); + } + if (tokens.size() > 0 && tokens.get(tokens.size()-1) instanceof SQPField){ + throw new ParseException("A field must contain a terminal"); + } + SQPToken token = new SQPField(SpanQueryParserBase.unescape(term)); + tokens.add(token); + } + + private void addRawTerm(String term, int nearDepth, List tokens) + throws ParseException{ + //The regex over-captures on a term...Term could be: + //AND or NOT boolean operator; and term could have boost + + //does the term == AND or NOT or OR + if (nearDepth == 0){ + SQPToken token = null; + if (term.equals(AND)){ + token = new SQPBooleanOpToken(SpanQueryParserBase.CONJ_AND); + } else if (term.equals(NOT)){ + token = new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT); + } else if (term.equals(OR)){ + token = new SQPBooleanOpToken(SpanQueryParserBase.CONJ_OR); + } + if (token != null){ + testBooleanTokens(tokens, (SQPBooleanOpToken)token); + tokens.add(token); + return; + } + + } + SQPToken token = null; + Matcher m = TERM_BOOST.matcher(term); + + int boosts = 0; + while (m.find()){ + + if (m.group(1) != null){ + token = new SQPTerm(unescape(term.substring(0, m.start(1)-1))); + tryToSetBoost((SQPBoostableToken)token, m.group(1)); + boosts++; + if (m.start(1) == 1 && m.end(1) == term.length()){ + throw new ParseException("Can't have a boost as a standalone term"); + } + } + + } + if (boosts > 1){ + throw new ParseException("Can't have more than one boost on a term"); + } + if (token == null){ + token = new SQPTerm(unescape(term)); + } + + + 
tokens.add(token); + } + + /** + * Test whether this token can be added to the list of tokens + * based on classic queryparser rules + * @param tokens + * @param token + * @throws ParseException + */ + private void testBooleanTokens(List tokens, SQPBooleanOpToken token) + throws ParseException { + //there are possible exceptions with tokens.size()==0, but they + //are the same exceptions as at clause beginning. + //Need to test elsewhere for start of clause issues. + if (tokens.size() == 0){ + return; + } + SQPToken t = tokens.get(tokens.size()-1); + if (t instanceof SQPBooleanOpToken){ + int curr = ((SQPBooleanOpToken)t).getType(); + int nxt = token.getType(); + boolean ex = false; + if (SQPBooleanOpToken.isMod(curr)){ + ex = true; + } else if ( curr == SpanQueryParser.CONJ_AND && + nxt == SpanQueryParser.CONJ_AND){ + ex = true; + } else if( curr == SpanQueryParser.CONJ_OR && + ! SQPBooleanOpToken.isMod(nxt) ){ + ex = true; + } else if (curr == SpanQueryParser.MOD_NOT){ + ex = true; + } + if (ex == true){ + throw new ParseException("Illegal combination of boolean conjunctions and modifiers"); + } + } + } + + private void testSingle(List tokens) throws ParseException{ + if (tokens.size() == 0){ + return; + } + if (tokens.size() == 1){ + SQPToken t = tokens.get(0); + if (t instanceof SQPTerminal){ + } else { + throw new ParseException("Must have at least one terminal"); + } + } + } + + private String unescape(String s){ + if (s.equals("\\AND")){ + return "AND"; + } + if (s.equals("\\NOT")){ + return "NOT"; + } + if (s.equals("\\OR")){ + return "OR"; + } + return s; + } + + private String unescapeRegex(String s){ + + Matcher m = UNESCAPE_REGEX.matcher(s); + StringBuilder sb = new StringBuilder(); + int last = 0; + while (m.find()){ + sb.append(s.substring(last, m.start(0))); + if (m.group(1).equals("/")){ + sb.append("/"); + } else { + sb.append("\\").append(m.group(1)); + } + + last = m.end(1); + } + if (last == 0){ + return s; + } + sb.append(s.substring(last)); + 
return sb.toString(); + } + + private String escapeDQuote(String s) { + //copied from escape in QueryParserBase. Had to remove \\ + //to handle quoted single terms. + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + // These characters are part of the query syntax and must be escaped + if (c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' + || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~' + || c == '*' || c == '?' || c == '|' || c == '&' || c == '/') { + sb.append('\\'); + } + sb.append(c); + } + return sb.toString(); + } +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java (revision 0) @@ -0,0 +1,183 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.queryparser.classic.ParseException; + +import org.apache.lucene.queryparser.spans.tokens.SQPBoostableToken; +import org.apache.lucene.queryparser.spans.tokens.SQPClause; +import org.apache.lucene.queryparser.spans.tokens.SQPNearClause; +import org.apache.lucene.queryparser.spans.tokens.SQPNotNearClause; +import org.apache.lucene.queryparser.spans.tokens.SQPOrClause; +import org.apache.lucene.queryparser.spans.tokens.SQPRangeTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPRegexTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPTerm; +import org.apache.lucene.queryparser.spans.tokens.SQPTerminal; +import org.apache.lucene.queryparser.spans.tokens.SQPToken; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; + +public abstract class AbstractSpanQueryParser extends SpanQueryParserBase { + + @Override + abstract public Query parse(String s) throws ParseException; + + + /** + *Recursively called to parse a span query + * + * This assumes that there are no FIELD tokens and no BOOLEAN operators + * @param tokens + * @param field + * @param parentClause + * @return SpanQuery + * @throws ParseException + */ + protected SpanQuery _parsePureSpanClause(final List tokens, + String field, SQPClause parentClause) + throws ParseException{ + + int start = parentClause.getTokenOffsetStart(); + int end = parentClause.getTokenOffsetEnd(); + if (end-start == 1){ + + if (parentClause instanceof SQPNearClause){ + SQPNearClause nc = (SQPNearClause)parentClause; + SQPToken t = tokens.get(start); + if (t instanceof SQPTerm){ + + SpanQuery ret = trySpecialHandlingForSpanNearWithOneComponent(field, (SQPTerm)t, nc); + if (ret != null){ + if (parentClause.getBoost() != SpanQueryParserBase.UNSPECIFIED_BOOST){ + 
ret.setBoost(parentClause.getBoost()); + } + return ret; + } + } + } + } + + List queries = new ArrayList(); + int i = start; + while (i < end){ + SQPToken t = tokens.get(i); + SpanQuery q = null; + if (t instanceof SQPClause){ + SQPClause c = (SQPClause)t; + q = _parsePureSpanClause(tokens, field, c); + i = c.getTokenOffsetEnd(); + } else if (t instanceof SQPTerminal){ + q = buildTerminal(field, (SQPTerminal)t); + i++; + } else { + throw new ParseException("Can't process field, boolean operators or a match all docs query in a pure span."); + } + if (q != null){ + queries.add(q); + } + } + if (queries == null || queries.size() == 0){ + return getEmptySpanQuery(); + } + return buildSpanQueryClause(queries, parentClause); + } + + + private SpanQuery trySpecialHandlingForSpanNearWithOneComponent(String field, + SQPTerm token, SQPNearClause clause) + throws ParseException{ + + int slop = (clause.getSlop() == SpanQueryParserBase.UNSPECIFIED_SLOP) ? getPhraseSlop() : clause.getSlop(); + boolean order = clause.getInOrder() == null ? 
true : clause.getInOrder().booleanValue(); + + SpanQuery ret = (SpanQuery)specialHandlingForSpanNearWithOneComponent(field, + token.getString(), slop, order); + return ret; + + } + + protected SpanQuery buildTerminal(String field, SQPTerminal token) throws ParseException{ + + + Query q = null; + if (token instanceof SQPRegexTerm){ + q = getRegexpQuery(field, ((SQPRegexTerm)token).getString()); + } else if (token instanceof SQPTerm){ + q = buildAnySingleTermQuery(field, ((SQPTerm)token).getString(), ((SQPTerm)token).isQuoted()); + } else if (token instanceof SQPRangeTerm){ + SQPRangeTerm rt = (SQPRangeTerm)token; + q = getRangeQuery(field, rt.getStart(), rt.getEnd(), + rt.getStartInclusive(), rt.getEndInclusive()); + } + if (q != null && token instanceof SQPBoostableToken){ + float boost = ((SQPBoostableToken)token).getBoost(); + if (boost != SpanQueryParserBase.UNSPECIFIED_BOOST){ + q.setBoost(boost); + } + } + if (q != null && q instanceof SpanQuery){ + return (SpanQuery)q; + } + return null; + } + + private SpanQuery buildSpanQueryClause(List queries, SQPClause clause) + throws ParseException { + SpanQuery q = null; + if (clause instanceof SQPOrClause){ + q = buildSpanOrQuery(queries); + } else if (clause instanceof SQPNearClause){ + + int slop = ((SQPNearClause)clause).getSlop(); + if (slop == UNSPECIFIED_SLOP){ + slop = getPhraseSlop(); + } + Boolean inOrder = ((SQPNearClause)clause).getInOrder(); + boolean order = false; + if (inOrder == null){ + order = slop > 0 ? 
false : true; + } else { + order = inOrder.booleanValue(); + } + q = buildSpanNearQuery(queries, + slop, order); + } else if (clause instanceof SQPNotNearClause){ + q = buildSpanNotNearQuery(queries, + ((SQPNotNearClause)clause).getNotPre(), + ((SQPNotNearClause)clause).getNotPost()); + + } + + if (clause.getBoost() != UNSPECIFIED_BOOST){ + q.setBoost(clause.getBoost()); + } + //now update boost if clause only had one child + if (q.getBoost() == UNSPECIFIED_BOOST && + clause.getBoost() != UNSPECIFIED_BOOST && ( + q instanceof SpanTermQuery || + q instanceof SpanMultiTermQueryWrapper)){ + q.setBoost(clause.getBoost()); + } + + return q; + } + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AnalyzingQueryParserBase.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AnalyzingQueryParserBase.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AnalyzingQueryParserBase.java (revision 0) @@ -0,0 +1,301 @@ +package org.apache.lucene.queryparser.spans; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParserBase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Version; + +/** + * Enables setting different Analyzers for different fields. + *

      + * Enables setting different analyzers for whole term vs. + * multiTerm (wildcard, fuzzy, prefix). + * + */ +public abstract class AnalyzingQueryParserBase extends QueryParserBase{ + + public enum NORM_MULTI_TERMS { + ANALYZE, + LOWERCASE, + NONE + }; + + private Map wholeTermAnalyzers = new HashMap(); + private Map multiTermAnalyzers = new HashMap(); + private NORM_MULTI_TERMS normMultiTerms = NORM_MULTI_TERMS.LOWERCASE; + + private final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)"); + + + private Analyzer multiTermAnalyzer; + /** + * Default initialization. The analyzer is used for both whole terms and multiTerms. + */ + @Override + public void init(Version matchVersion, String f, Analyzer a) { + super.init(matchVersion, f, a); + this.multiTermAnalyzer = a; + + } + + /** + * Expert. Set a different analyzer for whole terms vs. multiTerm subcomponents. + *

      + * This initializer has a side effect of setting normMultiTerms = NORM_MULTI_TERMS.ANALYZE + * @param matchVersion + * @param f + * @param a + * @param multiTermAnalyzer + */ + public void init(Version matchVersion, String f, Analyzer a, Analyzer multiTermAnalyzer) { + super.init(matchVersion, f, a); + this.multiTermAnalyzer = multiTermAnalyzer; + setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE); + } + + /** + * Notionally overrides functionality from analyzeMultitermTerm. Differences + * are that this consumes the full tokenstream, and it throws ParseException + * if it encounters no content terms or more than one. + * + * @param field + * @param part + * @param analyzerIn + * @return bytesRef to term part + * @throws ParseException, RuntimeException + */ + protected BytesRef analyzeMultitermTermParseEx(String field, String part, Analyzer analyzerIn) + throws ParseException { + //TODO: In QueryParserBase, analyzeMultiTerm doesn't currently consume all tokens, and it + //throws RuntimeExceptions and IllegalArgumentExceptions instead of parse. + //Otherwise this is copied verbatim. 
+ TokenStream source; + + if (analyzerIn == null) analyzerIn = getMultiTermAnalyzer(); + + try { + source = analyzerIn.tokenStream(field, part); + source.reset(); + } catch (IOException e) { + throw new ParseException("Unable to initialize TokenStream to analyze multiTerm term: " + part); + } + + TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); + BytesRef bytes = termAtt.getBytesRef(); + + int partCount = 0; + try { + if (!source.incrementToken()){ + + } else { + partCount++; + termAtt.fillBytesRef(); + while (source.incrementToken()){ + partCount++; + } + + } + } catch (IOException e1) { + throw new RuntimeException("Error analyzing multiterm: " + part); + } + + try { + source.end(); + source.close(); + } catch (IOException e) { + throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part); + } + if (partCount != 1){ + throw new ParseException("Couldn't find any content in >"+ part+"<"); + } + return BytesRef.deepCopyOf(bytes); + } + + //TODO: make this protected in QueryParserBase and then override it + //modify to throw only parse exception + protected BytesRef analyzeMultitermTermParseEx(String field, String part) throws ParseException{ + BytesRef b = null; + try{ + b = analyzeMultitermTermParseEx(field, part, getMultiTermAnalyzer(field)); + } catch (IllegalArgumentException e){ + throw new ParseException("Couldn't find any content in >"+ part+"<"); + } + return b; + } + + /** + * Analysis of wildcards is a bit tricky. This splits a term by wildcard + * and then analyzes the subcomponents. 
+ * + * @param field + * @param termText + * @return analyzed wildcard + * @throws ParseException + */ + protected String analyzeWildcard(String field, String termText) throws ParseException { + // plagiarized from AnalyzingQueryParser + Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termText); + StringBuilder sb = new StringBuilder(); + int last = 0; + + while (wildcardMatcher.find()) { + // continue if escaped char + if (wildcardMatcher.group(1) != null) { + continue; + } + + if (wildcardMatcher.start() > 0) { + String chunk = termText.substring(last, wildcardMatcher.start()); + BytesRef analyzed = analyzeMultitermTermParseEx(field, chunk); + sb.append(analyzed.utf8ToString()); + } + // append the wildcard character + sb.append(wildcardMatcher.group(2)); + + last = wildcardMatcher.end(); + } + if (last < termText.length()) { + sb.append(analyzeMultitermTermParseEx(field, termText.substring(last)).utf8ToString()); + } + return sb.toString(); + } + + /** + * Set different analyzers for different fields. During parsing, if a field isn't + * found in this map, the default getAnalyzer() analyzer is used. + * + * @param wholeTermAnalyzers + */ + public void setAnalyzers(Map wholeTermAnalyzers){ + this.wholeTermAnalyzers = wholeTermAnalyzers; + } + + /** + * Expert. Set different analyzers (wholeTerm and multiTerm) for different fields. + * During parsing, if a field isn't found in wholeTermAnalyzers, getAnalyzer() is called. + * If a field isn't found in multiTermAnalyzers, then getMultiTermAnalyzer() is called. + *

      + * As a side effect, this sets normMultiTerms to NORM_MULTI_TERMS.ANALYZE + *

      + * If set to null, the default analyzer will be used for all fields. + * + * @param wholeTermAnalyzers + * @param multiTermAnalyzers + */ + public void setAnalyzers(Map wholeTermAnalyzers, Map multiTermAnalyzers){ + this.wholeTermAnalyzers = wholeTermAnalyzers; + this.multiTermAnalyzers = multiTermAnalyzers; + normMultiTerms = NORM_MULTI_TERMS.ANALYZE; + } + /** + * If set to true, normMultiTerms is set to NORM_MULTI_TERMS.LOWERCASE. + * If set to false, this turns off all normalization and sets normMultiTerms to NORM_MULTI_TERMS.NONE. + * + * @deprecated use {@link #setNormMultiTerms(NORM_MULTI_TERMS)} + */ + @Override + @Deprecated + public void setLowercaseExpandedTerms(boolean lc){ + if (lc == true){ + normMultiTerms = NORM_MULTI_TERMS.LOWERCASE; + } else { + normMultiTerms = NORM_MULTI_TERMS.NONE; + } + super.setLowercaseExpandedTerms(lc); + } + + /** + * Returns true if normMultiTerms == NORM_MULTI_TERMS.LOWERCASE + * @deprecated use {@link #getNormMultiTerms()} + */ + @Override + @Deprecated + public boolean getLowercaseExpandedTerms(){ + if (normMultiTerms == NORM_MULTI_TERMS.LOWERCASE){ + return true; + } + return false; + } + + /** + * + * @return analyzer to use for multiTerms if a field isn't specified or + * not found in the multiTermAnalyzers map. + */ + public Analyzer getMultiTermAnalyzer() { + return multiTermAnalyzer; + } + + /** + * + * @return type of normalization to perform on multiTerms + */ + public NORM_MULTI_TERMS getNormMultiTerms() { + return normMultiTerms; + } + + public void setNormMultiTerms(NORM_MULTI_TERMS norm) { + this.normMultiTerms = norm; + //TODO: get rid of these side effects once deprecated setLowercaseExpandedTerms is gone. + //These are currently needed because (at least) regexp creation + //is driven by QueryParserBase, which still relies on these. 
+ if (norm == NORM_MULTI_TERMS.LOWERCASE){ + setLowercaseExpandedTerms(true); + } else if (norm == NORM_MULTI_TERMS.NONE){ + setLowercaseExpandedTerms(false); + } + } + + /** + * + * @param field + * @return analyzer to use on a requested field for whole terms. Returns getAnalyzer() if + * field is not found in wholeTermAnalyzers. + */ + public Analyzer getWholeTermAnalyzer(String field){ + if (wholeTermAnalyzers != null && + wholeTermAnalyzers.containsKey(field)){ + return wholeTermAnalyzers.get(field); + } + return getAnalyzer(); + } + + /** + * + * @param field + * @return analyzer to use on a requested field for multiTerm terms. Returns getMultiTermAnalyzer() + * if field is not found in multiTermAnalyzers + */ + public Analyzer getMultiTermAnalyzer(String field){ + if (multiTermAnalyzers != null && + multiTermAnalyzers.containsKey(field)){ + return multiTermAnalyzers.get(field); + } + return getMultiTermAnalyzer(); + } + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/package.html =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/package.html (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/package.html (revision 0) @@ -0,0 +1,28 @@ + + + + +

      +SpanQueryParser is the main class in this package. +

      +

      +The SpanOnlyParser parses a subset of the overall syntax (no boolean logic, no field info and no *:*) +

      + + +