Index: src/test/org/apache/lucene/queryParser/TestQueryParser.java =================================================================== --- src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 653997) +++ src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy) @@ -33,6 +33,7 @@ import java.io.IOException; import java.io.Reader; import java.text.DateFormat; +import java.text.Collator; import java.util.Calendar; import java.util.Date; import java.util.Locale; @@ -409,6 +410,51 @@ assertQueryEquals("( bar blar { a TO z}) ", null, "bar blar {a TO z}"); assertQueryEquals("gack ( bar blar { a TO z}) ", null, "gack (bar blar {a TO z})"); } + + public void testFarsiRangeCollating() throws Exception { + + RAMDirectory ramDir = new RAMDirectory(); + IndexWriter iw = new IndexWriter(ramDir, new WhitespaceAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field("content","\u0633\u0627\u0628", + Field.Store.YES, Field.Index.UN_TOKENIZED)); + iw.addDocument(doc); + iw.close(); + IndexSearcher is = new IndexSearcher(ramDir); + + QueryParser qp = new QueryParser("content", new WhitespaceAnalyzer()); + + // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in + // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi + // characters properly. + Collator c = Collator.getInstance(new Locale("ar")); + qp.setRangeCollator(c); + + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // index Term below should NOT be returned by a ConstantScoreRangeQuery + // with a Farsi Collator (or an Arabic one for the case when Farsi is not + // supported). 
+ + // Test ConstantScoreRangeQuery + qp.setUseOldRangeQuery(false); + Hits result = is.search(qp.parse("[ \u062F TO \u0698 ]")); + assertEquals("The index Term should not be included.", 0, result.length()); + + result = is.search(qp.parse("[ \u0633 TO \u0638 ]")); + assertEquals("The index Term should be included.", 1, result.length()); + + // Test RangeQuery + qp.setUseOldRangeQuery(true); + result = is.search(qp.parse("[ \u062F TO \u0698 ]")); + assertEquals("The index Term should not be included.", 0, result.length()); + + result = is.search(qp.parse("[ \u0633 TO \u0638 ]")); + assertEquals("The index Term should be included.", 1, result.length()); + + is.close(); + } /** for testing legacy DateField support */ private String getLegacyDate(String s) throws Exception { Index: src/test/org/apache/lucene/search/TestRangeFilter.java =================================================================== --- src/test/org/apache/lucene/search/TestRangeFilter.java (revision 653997) +++ src/test/org/apache/lucene/search/TestRangeFilter.java (working copy) @@ -18,9 +18,16 @@ */ import java.io.IOException; +import java.text.Collator; +import java.util.Locale; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.store.RAMDirectory; /** * A basic 'positive' Unit test class for the RangeFilter class. 
@@ -42,7 +49,7 @@ public void testRangeFilterId() throws IOException { - IndexReader reader = IndexReader.open(index); + IndexReader reader = IndexReader.open(signedIndex.index); IndexSearcher search = new IndexSearcher(reader); int medId = ((maxId - minId) / 2); @@ -122,13 +129,96 @@ } + public void testRangeFilterIdCollating() throws IOException { + + IndexReader reader = IndexReader.open(signedIndex.index); + IndexSearcher search = new IndexSearcher(reader); + + Collator c = Collator.getInstance(Locale.ENGLISH); + + int medId = ((maxId - minId) / 2); + + String minIP = pad(minId); + String maxIP = pad(maxId); + String medIP = pad(medId); + + int numDocs = reader.numDocs(); + + assertEquals("num of docs", numDocs, 1+ maxId - minId); + + Hits result; + Query q = new TermQuery(new Term("body","body")); + + // test id, bounded on both ends + + result = search.search(q,new RangeFilter("id",minIP,maxIP,T,T,c)); + assertEquals("find all", numDocs, result.length()); + + result = search.search(q,new RangeFilter("id",minIP,maxIP,T,F,c)); + assertEquals("all but last", numDocs-1, result.length()); + + result = search.search(q,new RangeFilter("id",minIP,maxIP,F,T,c)); + assertEquals("all but first", numDocs-1, result.length()); + + result = search.search(q,new RangeFilter("id",minIP,maxIP,F,F,c)); + assertEquals("all but ends", numDocs-2, result.length()); + + result = search.search(q,new RangeFilter("id",medIP,maxIP,T,T,c)); + assertEquals("med and up", 1+ maxId-medId, result.length()); + + result = search.search(q,new RangeFilter("id",minIP,medIP,T,T,c)); + assertEquals("up to med", 1+ medId-minId, result.length()); + + // unbounded id + + result = search.search(q,new RangeFilter("id",minIP,null,T,F,c)); + assertEquals("min and up", numDocs, result.length()); + + result = search.search(q,new RangeFilter("id",null,maxIP,F,T,c)); + assertEquals("max and down", numDocs, result.length()); + + result = search.search(q,new RangeFilter("id",minIP,null,F,F,c)); + 
assertEquals("not min, but up", numDocs-1, result.length()); + + result = search.search(q,new RangeFilter("id",null,maxIP,F,F,c)); + assertEquals("not max, but down", numDocs-1, result.length()); + + result = search.search(q,new RangeFilter("id",medIP,maxIP,T,F,c)); + assertEquals("med and up, not max", maxId-medId, result.length()); + + result = search.search(q,new RangeFilter("id",minIP,medIP,F,T,c)); + assertEquals("not min, up to med", medId-minId, result.length()); + + // very small sets + + result = search.search(q,new RangeFilter("id",minIP,minIP,F,F,c)); + assertEquals("min,min,F,F", 0, result.length()); + result = search.search(q,new RangeFilter("id",medIP,medIP,F,F,c)); + assertEquals("med,med,F,F", 0, result.length()); + result = search.search(q,new RangeFilter("id",maxIP,maxIP,F,F,c)); + assertEquals("max,max,F,F", 0, result.length()); + + result = search.search(q,new RangeFilter("id",minIP,minIP,T,T,c)); + assertEquals("min,min,T,T", 1, result.length()); + result = search.search(q,new RangeFilter("id",null,minIP,F,T,c)); + assertEquals("nul,min,F,T", 1, result.length()); + + result = search.search(q,new RangeFilter("id",maxIP,maxIP,T,T,c)); + assertEquals("max,max,T,T", 1, result.length()); + result = search.search(q,new RangeFilter("id",maxIP,null,T,F,c)); + assertEquals("max,nul,T,T", 1, result.length()); + + result = search.search(q,new RangeFilter("id",medIP,medIP,T,T,c)); + assertEquals("med,med,T,T", 1, result.length()); + } + public void testRangeFilterRand() throws IOException { - IndexReader reader = IndexReader.open(index); + IndexReader reader = IndexReader.open(signedIndex.index); IndexSearcher search = new IndexSearcher(reader); - String minRP = pad(minR); - String maxRP = pad(maxR); + String minRP = pad(signedIndex.minR); + String maxRP = pad(signedIndex.maxR); int numDocs = reader.numDocs(); @@ -184,4 +274,106 @@ } + public void testRangeFilterRandCollating() throws IOException { + + // using the unsigned index because collation seems to 
ignore hyphens + IndexReader reader = IndexReader.open(unsignedIndex.index); + IndexSearcher search = new IndexSearcher(reader); + + Collator c = Collator.getInstance(Locale.ENGLISH); + + String minRP = pad(unsignedIndex.minR); + String maxRP = pad(unsignedIndex.maxR); + + int numDocs = reader.numDocs(); + + assertEquals("num of docs", numDocs, 1+ maxId - minId); + + Hits result; + Query q = new TermQuery(new Term("body","body")); + + // test extremes, bounded on both ends + + result = search.search(q,new RangeFilter("rand",minRP,maxRP,T,T,c)); + assertEquals("find all", numDocs, result.length()); + + result = search.search(q,new RangeFilter("rand",minRP,maxRP,T,F,c)); + assertEquals("all but biggest", numDocs-1, result.length()); + + result = search.search(q,new RangeFilter("rand",minRP,maxRP,F,T,c)); + assertEquals("all but smallest", numDocs-1, result.length()); + + result = search.search(q,new RangeFilter("rand",minRP,maxRP,F,F,c)); + assertEquals("all but extremes", numDocs-2, result.length()); + + // unbounded + + result = search.search(q,new RangeFilter("rand",minRP,null,T,F,c)); + assertEquals("smallest and up", numDocs, result.length()); + + result = search.search(q,new RangeFilter("rand",null,maxRP,F,T,c)); + assertEquals("biggest and down", numDocs, result.length()); + + result = search.search(q,new RangeFilter("rand",minRP,null,F,F,c)); + assertEquals("not smallest, but up", numDocs-1, result.length()); + + result = search.search(q,new RangeFilter("rand",null,maxRP,F,F,c)); + assertEquals("not biggest, but down", numDocs-1, result.length()); + + // very small sets + + result = search.search(q,new RangeFilter("rand",minRP,minRP,F,F,c)); + assertEquals("min,min,F,F", 0, result.length()); + result = search.search(q,new RangeFilter("rand",maxRP,maxRP,F,F,c)); + assertEquals("max,max,F,F", 0, result.length()); + + result = search.search(q,new RangeFilter("rand",minRP,minRP,T,T,c)); + assertEquals("min,min,T,T", 1, result.length()); + result = 
search.search(q,new RangeFilter("rand",null,minRP,F,T,c)); + assertEquals("nul,min,F,T", 1, result.length()); + + result = search.search(q,new RangeFilter("rand",maxRP,maxRP,T,T,c)); + assertEquals("max,max,T,T", 1, result.length()); + result = search.search(q,new RangeFilter("rand",maxRP,null,T,F,c)); + assertEquals("max,nul,T,T", 1, result.length()); + } + + public void testFarsi() throws Exception { + + /* build an index */ + RAMDirectory farsiIndex = new RAMDirectory(); + IndexWriter writer = new IndexWriter(farsiIndex, new SimpleAnalyzer(), T, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field("content","\u0633\u0627\u0628", + Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.add(new Field("body", "body", + Field.Store.YES, Field.Index.UN_TOKENIZED)); + writer.addDocument(doc); + + writer.optimize(); + writer.close(); + + IndexReader reader = IndexReader.open(farsiIndex); + IndexSearcher search = new IndexSearcher(reader); + Query q = new TermQuery(new Term("body","body")); + + // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in + // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi + // characters properly. + Collator collator = Collator.getInstance(new Locale("ar")); + + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // index Term below should NOT be returned by a RangeFilter with a Farsi + // Collator (or an Arabic one for the case when Farsi is not supported). 
+ Hits result = search.search + (q, new RangeFilter("content", "\u062F", "\u0698", T, T, collator)); + assertEquals("The index Term should not be included.", 0, result.length()); + + result = search.search + (q, new RangeFilter("content", "\u0633", "\u0638", T, T, collator)); + assertEquals("The index Term should be included.", 1, result.length()); + search.close(); + } } Index: src/test/org/apache/lucene/search/TestRangeQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestRangeQuery.java (revision 653997) +++ src/test/org/apache/lucene/search/TestRangeQuery.java (working copy) @@ -26,6 +26,8 @@ import org.apache.lucene.util.LuceneTestCase; import java.io.IOException; +import java.util.Locale; +import java.text.Collator; /** * @author goller @@ -132,6 +134,78 @@ assertFalse("queries with different inclusive are not equal", query.equals(other)); } + public void testExclusiveCollating() throws Exception { + Query query = new RangeQuery(new Term("content", "A"), + new Term("content", "C"), + false, Collator.getInstance(Locale.ENGLISH)); + initializeIndex(new String[] {"A", "B", "C", "D"}); + IndexSearcher searcher = new IndexSearcher(dir); + Hits hits = searcher.search(query); + assertEquals("A,B,C,D, only B in range", 1, hits.length()); + searcher.close(); + + initializeIndex(new String[] {"A", "B", "D"}); + searcher = new IndexSearcher(dir); + hits = searcher.search(query); + assertEquals("A,B,D, only B in range", 1, hits.length()); + searcher.close(); + + addDoc("C"); + searcher = new IndexSearcher(dir); + hits = searcher.search(query); + assertEquals("C added, still only B in range", 1, hits.length()); + searcher.close(); + } + + public void testInclusiveCollating() throws Exception { + Query query = new RangeQuery(new Term("content", "A"), + new Term("content", "C"), + true, Collator.getInstance(Locale.ENGLISH)); + + initializeIndex(new String[]{"A", "B", "C", "D"}); + IndexSearcher searcher = 
new IndexSearcher(dir); + Hits hits = searcher.search(query); + assertEquals("A,B,C,D - A,B,C in range", 3, hits.length()); + searcher.close(); + + initializeIndex(new String[]{"A", "B", "D"}); + searcher = new IndexSearcher(dir); + hits = searcher.search(query); + assertEquals("A,B,D - A and B in range", 2, hits.length()); + searcher.close(); + + addDoc("C"); + searcher = new IndexSearcher(dir); + hits = searcher.search(query); + assertEquals("C added - A, B, C in range", 3, hits.length()); + searcher.close(); + } + + public void testFarsi() throws Exception { + // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in + // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi + // characters properly. + Collator collator = Collator.getInstance(new Locale("ar")); + Query query = new RangeQuery(new Term("content", "\u062F"), + new Term("content", "\u0698"), + true, collator); + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // index Term below should NOT be returned by a RangeQuery with a Farsi + // Collator (or an Arabic one for the case when Farsi is not supported). 
+ initializeIndex(new String[]{ "\u0633\u0627\u0628"}); + IndexSearcher searcher = new IndexSearcher(dir); + Hits hits = searcher.search(query); + assertEquals("The index Term should not be included.", 0, hits.length()); + + query = new RangeQuery(new Term("content", "\u0633"), + new Term("content", "\u0638"), + true, collator); + hits = searcher.search(query); + assertEquals("The index Term should be included.", 1, hits.length()); + searcher.close(); + } + private void initializeIndex(String[] values) throws IOException { IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); for (int i = 0; i < values.length; i++) { @@ -156,5 +230,3 @@ docCount++; } } - - Index: src/test/org/apache/lucene/search/TestConstantScoreRangeQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestConstantScoreRangeQuery.java (revision 653997) +++ src/test/org/apache/lucene/search/TestConstantScoreRangeQuery.java (working copy) @@ -18,6 +18,7 @@ */ import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; @@ -27,6 +28,8 @@ import org.apache.lucene.store.RAMDirectory; import java.io.IOException; +import java.text.Collator; +import java.util.Locale; import junit.framework.Assert; @@ -92,12 +95,25 @@ return new ConstantScoreRangeQuery(f,l,h,il,ih); } + /** macro for readability */ + public static Query csrq(String f, String l, String h, + boolean il, boolean ih, Collator c) { + return new ConstantScoreRangeQuery(f,l,h,il,ih,c); + } + public void testBasics() throws IOException { QueryUtils.check(csrq("data","1","6",T,T)); QueryUtils.check(csrq("data","A","Z",T,T)); QueryUtils.checkUnequal(csrq("data","1","6",T,T), csrq("data","A","Z",T,T)); } + public void testBasicsCollating() throws IOException { + 
Collator c = Collator.getInstance(Locale.ENGLISH); + QueryUtils.check(csrq("data","1","6",T,T,c)); + QueryUtils.check(csrq("data","A","Z",T,T,c)); + QueryUtils.checkUnequal(csrq("data","1","6",T,T,c), csrq("data","A","Z",T,T,c)); + } + public void testEqualScores() throws IOException { // NOTE: uses index build in *this* setUp @@ -205,7 +221,7 @@ public void testRangeQueryId() throws IOException { // NOTE: uses index build in *super* setUp - IndexReader reader = IndexReader.open(index); + IndexReader reader = IndexReader.open(signedIndex.index); IndexSearcher search = new IndexSearcher(reader); int medId = ((maxId - minId) / 2); @@ -284,21 +300,104 @@ } + public void testRangeQueryIdCollating() throws IOException { + // NOTE: uses index build in *super* setUp + + IndexReader reader = IndexReader.open(signedIndex.index); + IndexSearcher search = new IndexSearcher(reader); + + int medId = ((maxId - minId) / 2); + + String minIP = pad(minId); + String maxIP = pad(maxId); + String medIP = pad(medId); + + int numDocs = reader.numDocs(); + + assertEquals("num of docs", numDocs, 1+ maxId - minId); + + Hits result; + + Collator c = Collator.getInstance(Locale.ENGLISH); + + // test id, bounded on both ends + + result = search.search(csrq("id",minIP,maxIP,T,T,c)); + assertEquals("find all", numDocs, result.length()); + + result = search.search(csrq("id",minIP,maxIP,T,F,c)); + assertEquals("all but last", numDocs-1, result.length()); + + result = search.search(csrq("id",minIP,maxIP,F,T,c)); + assertEquals("all but first", numDocs-1, result.length()); + + result = search.search(csrq("id",minIP,maxIP,F,F,c)); + assertEquals("all but ends", numDocs-2, result.length()); + + result = search.search(csrq("id",medIP,maxIP,T,T,c)); + assertEquals("med and up", 1+ maxId-medId, result.length()); + + result = search.search(csrq("id",minIP,medIP,T,T,c)); + assertEquals("up to med", 1+ medId-minId, result.length()); + + // unbounded id + + result = 
search.search(csrq("id",minIP,null,T,F,c)); + assertEquals("min and up", numDocs, result.length()); + + result = search.search(csrq("id",null,maxIP,F,T,c)); + assertEquals("max and down", numDocs, result.length()); + + result = search.search(csrq("id",minIP,null,F,F,c)); + assertEquals("not min, but up", numDocs-1, result.length()); + + result = search.search(csrq("id",null,maxIP,F,F,c)); + assertEquals("not max, but down", numDocs-1, result.length()); + + result = search.search(csrq("id",medIP,maxIP,T,F,c)); + assertEquals("med and up, not max", maxId-medId, result.length()); + + result = search.search(csrq("id",minIP,medIP,F,T,c)); + assertEquals("not min, up to med", medId-minId, result.length()); + + // very small sets + + result = search.search(csrq("id",minIP,minIP,F,F,c)); + assertEquals("min,min,F,F,c", 0, result.length()); + result = search.search(csrq("id",medIP,medIP,F,F,c)); + assertEquals("med,med,F,F,c", 0, result.length()); + result = search.search(csrq("id",maxIP,maxIP,F,F,c)); + assertEquals("max,max,F,F,c", 0, result.length()); + + result = search.search(csrq("id",minIP,minIP,T,T,c)); + assertEquals("min,min,T,T,c", 1, result.length()); + result = search.search(csrq("id",null,minIP,F,T,c)); + assertEquals("nul,min,F,T,c", 1, result.length()); + + result = search.search(csrq("id",maxIP,maxIP,T,T,c)); + assertEquals("max,max,T,T,c", 1, result.length()); + result = search.search(csrq("id",maxIP,null,T,F,c)); + assertEquals("max,nul,T,T,c", 1, result.length()); + + result = search.search(csrq("id",medIP,medIP,T,T,c)); + assertEquals("med,med,T,T,c", 1, result.length()); + } + + public void testRangeQueryRand() throws IOException { // NOTE: uses index build in *super* setUp - IndexReader reader = IndexReader.open(index); + IndexReader reader = IndexReader.open(signedIndex.index); IndexSearcher search = new IndexSearcher(reader); - String minRP = pad(minR); - String maxRP = pad(maxR); + String minRP = pad(signedIndex.minR); + String maxRP = 
pad(signedIndex.maxR); int numDocs = reader.numDocs(); assertEquals("num of docs", numDocs, 1+ maxId - minId); Hits result; - Query q = new TermQuery(new Term("body","body")); // test extremes, bounded on both ends @@ -347,4 +446,104 @@ } + public void testRangeQueryRandCollating() throws IOException { + // NOTE: uses index build in *super* setUp + + // using the unsigned index because collation seems to ignore hyphens + IndexReader reader = IndexReader.open(unsignedIndex.index); + IndexSearcher search = new IndexSearcher(reader); + + String minRP = pad(unsignedIndex.minR); + String maxRP = pad(unsignedIndex.maxR); + + int numDocs = reader.numDocs(); + + assertEquals("num of docs", numDocs, 1+ maxId - minId); + + Hits result; + + Collator c = Collator.getInstance(Locale.ENGLISH); + + // test extremes, bounded on both ends + + result = search.search(csrq("rand",minRP,maxRP,T,T,c)); + assertEquals("find all", numDocs, result.length()); + + result = search.search(csrq("rand",minRP,maxRP,T,F,c)); + assertEquals("all but biggest", numDocs-1, result.length()); + + result = search.search(csrq("rand",minRP,maxRP,F,T,c)); + assertEquals("all but smallest", numDocs-1, result.length()); + + result = search.search(csrq("rand",minRP,maxRP,F,F,c)); + assertEquals("all but extremes", numDocs-2, result.length()); + + // unbounded + + result = search.search(csrq("rand",minRP,null,T,F,c)); + assertEquals("smallest and up", numDocs, result.length()); + + result = search.search(csrq("rand",null,maxRP,F,T,c)); + assertEquals("biggest and down", numDocs, result.length()); + + result = search.search(csrq("rand",minRP,null,F,F,c)); + assertEquals("not smallest, but up", numDocs-1, result.length()); + + result = search.search(csrq("rand",null,maxRP,F,F,c)); + assertEquals("not biggest, but down", numDocs-1, result.length()); + + // very small sets + + result = search.search(csrq("rand",minRP,minRP,F,F,c)); + assertEquals("min,min,F,F,c", 0, result.length()); + result = 
search.search(csrq("rand",maxRP,maxRP,F,F,c)); + assertEquals("max,max,F,F,c", 0, result.length()); + + result = search.search(csrq("rand",minRP,minRP,T,T,c)); + assertEquals("min,min,T,T,c", 1, result.length()); + result = search.search(csrq("rand",null,minRP,F,T,c)); + assertEquals("nul,min,F,T,c", 1, result.length()); + + result = search.search(csrq("rand",maxRP,maxRP,T,T,c)); + assertEquals("max,max,T,T,c", 1, result.length()); + result = search.search(csrq("rand",maxRP,null,T,F,c)); + assertEquals("max,nul,T,T,c", 1, result.length()); + } + + public void testFarsi() throws Exception { + + /* build an index */ + RAMDirectory farsiIndex = new RAMDirectory(); + IndexWriter writer = new IndexWriter(farsiIndex, new SimpleAnalyzer(), T, + IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field("content","\u0633\u0627\u0628", + Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.add(new Field("body", "body", + Field.Store.YES, Field.Index.UN_TOKENIZED)); + writer.addDocument(doc); + + writer.optimize(); + writer.close(); + + IndexReader reader = IndexReader.open(farsiIndex); + IndexSearcher search = new IndexSearcher(reader); + + // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in + // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi + // characters properly. + Collator c = Collator.getInstance(new Locale("ar")); + + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // index Term below should NOT be returned by a ConstantScoreRangeQuery + // with a Farsi Collator (or an Arabic one for the case when Farsi is + // not supported). 
+ Hits result = search.search(csrq("content","\u062F", "\u0698", T, T, c)); + assertEquals("The index Term should not be included.", 0, result.length()); + + result = search.search(csrq("content", "\u0633", "\u0638", T, T, c)); + assertEquals("The index Term should be included.", 1, result.length()); + search.close(); + } } Index: src/test/org/apache/lucene/search/BaseTestRangeFilter.java =================================================================== --- src/test/org/apache/lucene/search/BaseTestRangeFilter.java (revision 653997) +++ src/test/org/apache/lucene/search/BaseTestRangeFilter.java (working copy) @@ -32,12 +32,30 @@ public static final boolean F = false; public static final boolean T = true; - RAMDirectory index = new RAMDirectory(); Random rand = new Random(101); // use a set seed to test is deterministic + + /** + * Collation interacts badly with hyphens -- collation produces different + * ordering than Unicode code-point ordering -- so two indexes are created: + * one which can't have negative random integers, for testing collated + * ranges, and the other which can have negative random integers, for all + * other tests. 
+ */ + class TestIndex { + int maxR; + int minR; + boolean allowNegativeRandomInts; + RAMDirectory index = new RAMDirectory(); + + TestIndex(int minR, int maxR, boolean allowNegativeRandomInts) { + this.minR = minR; + this.maxR = maxR; + this.allowNegativeRandomInts = allowNegativeRandomInts; + } + } + TestIndex signedIndex = new TestIndex(Integer.MAX_VALUE, Integer.MIN_VALUE, true); + TestIndex unsignedIndex = new TestIndex(Integer.MAX_VALUE, 0, false); - int maxR = Integer.MIN_VALUE; - int minR = Integer.MAX_VALUE; - int minId = 0; int maxId = 10000; @@ -65,28 +83,31 @@ public BaseTestRangeFilter(String name) { super(name); - build(); + build(signedIndex); + build(unsignedIndex); } public BaseTestRangeFilter() { - build(); + build(signedIndex); + build(unsignedIndex); } - private void build() { + protected void build(TestIndex index) { try { /* build an index */ - IndexWriter writer = new IndexWriter(index, new SimpleAnalyzer(), T, + IndexWriter writer = new IndexWriter(index.index, new SimpleAnalyzer(), T, IndexWriter.MaxFieldLength.LIMITED); for (int d = minId; d <= maxId; d++) { Document doc = new Document(); doc.add(new Field("id",pad(d), Field.Store.YES, Field.Index.UN_TOKENIZED)); - int r= rand.nextInt(); - if (maxR < r) { - maxR = r; + int r= index.allowNegativeRandomInts + ? 
rand.nextInt() : rand.nextInt(Integer.MAX_VALUE); + if (index.maxR < r) { + index.maxR = r; } - if (r < minR) { - minR = r; + if (r < index.minR) { + index.minR = r; } doc.add(new Field("rand",pad(r), Field.Store.YES, Field.Index.UN_TOKENIZED)); doc.add(new Field("body","body", Field.Store.YES, Field.Index.UN_TOKENIZED)); Index: src/java/org/apache/lucene/queryParser/QueryParser.java =================================================================== --- src/java/org/apache/lucene/queryParser/QueryParser.java (revision 653997) +++ src/java/org/apache/lucene/queryParser/QueryParser.java (working copy) @@ -114,6 +114,10 @@ // maps field names to date resolutions Map fieldToDateResolution = null; + // The collator to use when determining range inclusion, + // for use when constructing RangeQuerys and ConstantScoreRangeQuerys. + Collator rangeCollator = null; + /** The default operator for parsing queries. * Use {@link QueryParser#setDefaultOperator} to change it. */ @@ -391,6 +395,34 @@ return resolution; } + /** + * Sets the collator used to determine index term inclusion in ranges + * specified either for ConstantScoreRangeQuerys or RangeQuerys (if + * {@link #setUseOldRangeQuery(boolean)} is called with a true + * value.) + *

+ * WARNING: Setting the range collator to a non-null value + * using this method will cause every single index Term in the Field + * referenced by lowerTerm and/or upperTerm to be examined. Depending on the + * number of index Terms in this Field, the operation could be very slow. + * + * @param rc the collator to use when constructing RangeQuerys + * and ConstantScoreRangeQuerys + */ + public void setRangeCollator(Collator rc) { + rangeCollator = rc; + } + + /** + * @return the collator used to determine index term inclusion in ranges + * specified either for ConstantScoreRangeQuerys or RangeQuerys (if + * {@link #setUseOldRangeQuery(boolean)} is called with a true + * value.) + */ + public Collator getRangeCollator() { + return rangeCollator; + } + protected void addClause(Vector clauses, int conj, int mods, Query q) { boolean required, prohibited; @@ -610,11 +642,12 @@ { return new RangeQuery(new Term(field, part1), new Term(field, part2), - inclusive); + inclusive, rangeCollator); } else { - return new ConstantScoreRangeQuery(field,part1,part2,inclusive,inclusive); + return new ConstantScoreRangeQuery + (field, part1, part2, inclusive, inclusive, rangeCollator); } } @@ -1535,6 +1568,7 @@ final private void jj_rescan_token() { jj_rescan = true; for (int i = 0; i < 1; i++) { + try { JJCalls p = jj_2_rtns[i]; do { if (p.gen > jj_gen) { @@ -1545,6 +1579,7 @@ } p = p.next; } while (p != null); + } catch(LookaheadSuccess ls) { } } jj_rescan = false; } Index: src/java/org/apache/lucene/queryParser/QueryParser.jj =================================================================== --- src/java/org/apache/lucene/queryParser/QueryParser.jj (revision 653997) +++ src/java/org/apache/lucene/queryParser/QueryParser.jj (working copy) @@ -138,6 +138,10 @@ // maps field names to date resolutions Map fieldToDateResolution = null; + // The collator to use when determining range inclusion, + // for use when constructing RangeQuerys and ConstantScoreRangeQuerys. 
+ Collator rangeCollator = null; + /** The default operator for parsing queries. * Use {@link QueryParser#setDefaultOperator} to change it. */ @@ -414,6 +418,35 @@ return resolution; } + + /** + * Sets the collator used to determine index term inclusion in ranges + * specified either for ConstantScoreRangeQuerys or RangeQuerys (if + * {@link #setUseOldRangeQuery(boolean)} is called with a true + * value.) + *

+ * WARNING: Setting the rangeCollator to a non-null + * collator using this method will cause every single index Term in the + * Field referenced by lowerTerm and/or upperTerm to be examined. + * Depending on the number of index Terms in this Field, the operation could + * be very slow. + * + * @param rc the collator to use when constructing RangeQuerys + * and ConstantScoreRangeQuerys + */ + public void setRangeCollator(Collator rc) { + rangeCollator = rc; + } + + /** + * @return the collator used to determine index term inclusion in ranges + * specified either for ConstantScoreRangeQuerys or RangeQuerys (if + * {@link #setUseOldRangeQuery(boolean)} is called with a true + * value.) + */ + public Collator getRangeCollator() { + return rangeCollator; + } protected void addClause(Vector clauses, int conj, int mods, Query q) { boolean required, prohibited; @@ -634,11 +667,12 @@ { return new RangeQuery(new Term(field, part1), new Term(field, part2), - inclusive); + inclusive, rangeCollator); } else { - return new ConstantScoreRangeQuery(field,part1,part2,inclusive,inclusive); + return new ConstantScoreRangeQuery + (field, part1, part2, inclusive, inclusive, rangeCollator); } } Index: src/java/org/apache/lucene/search/RangeFilter.java =================================================================== --- src/java/org/apache/lucene/search/RangeFilter.java (revision 653997) +++ src/java/org/apache/lucene/search/RangeFilter.java (working copy) @@ -25,6 +25,7 @@ import java.io.IOException; import java.util.BitSet; +import java.text.Collator; /** * A Filter that restricts search results to a range of values in a given @@ -42,8 +43,9 @@ private String upperTerm; private boolean includeLower; private boolean includeUpper; + private Collator collator; - /** + /** * @param fieldName The field this range applies to * @param lowerTerm The lower bound on this range * @param upperTerm The upper bound on this range @@ -74,8 +76,32 @@ ("The upper bound must be non-null to be 
inclusive"); } } - + /** + * WARNING: Using this constructor and supplying a non-null + * value in the collator parameter will cause every single + * index Term in the Field referenced by lowerTerm and/or upperTerm to be + * examined. Depending on the number of index Terms in this Field, the + * operation could be very slow. + * + * @param lowerTerm The lower bound on this range + * @param upperTerm The upper bound on this range + * @param includeLower Does this range include the lower bound? + * @param includeUpper Does this range include the upper bound? + * @param collator The collator to use when determining range inclusion; set + * to null to use Unicode code point ordering instead of collation. + * @throws IllegalArgumentException if both terms are null or if + * lowerTerm is null and includeLower is true (similar for upperTerm + * and includeUpper) + */ + public RangeFilter(String fieldName, String lowerTerm, String upperTerm, + boolean includeLower, boolean includeUpper, + Collator collator) { + this(fieldName, lowerTerm, upperTerm, includeLower, includeUpper); + this.collator = collator; + } + + /** * Constructs a filter for field fieldName matching * less than or equal to upperTerm. */ @@ -100,52 +126,81 @@ public BitSet bits(IndexReader reader) throws IOException { BitSet bits = new BitSet(reader.maxDoc()); TermEnum enumerator = - (null != lowerTerm + (null != lowerTerm && collator == null ? reader.terms(new Term(fieldName, lowerTerm)) : reader.terms(new Term(fieldName,""))); - + try { - + if (enumerator.term() == null) { return bits; } - - boolean checkLower = false; - if (!includeLower) // make adjustments to set to exclusive - checkLower = true; + + TermDocs termDocs = reader.termDocs(); + + if (collator != null) { + try { + do { + Term term = enumerator.term(); + if (term != null && term.field().equals(fieldName)) { + if ((lowerTerm == null + || (includeLower + ? 
collator.compare(term.text(), lowerTerm) >= 0 + : collator.compare(term.text(), lowerTerm) > 0)) + && (upperTerm == null + || (includeUpper + ? collator.compare(term.text(), upperTerm) <= 0 + : collator.compare(term.text(), upperTerm) < 0))) { + /* we have a good term, find the docs */ + termDocs.seek(enumerator.term()); + while (termDocs.next()) { + bits.set(termDocs.doc()); + } + } + } + } + while (enumerator.next()); + } + finally { + termDocs.close(); + } + } else { // collator is null - use Unicode code point ordering + boolean checkLower = false; + if (!includeLower) // make adjustments to set to exclusive + checkLower = true; - TermDocs termDocs = reader.termDocs(); - try { + try { - do { - Term term = enumerator.term(); - if (term != null && term.field().equals(fieldName)) { - if (!checkLower || null==lowerTerm || term.text().compareTo(lowerTerm) > 0) { - checkLower = false; - if (upperTerm != null) { - int compare = upperTerm.compareTo(term.text()); - /* if beyond the upper term, or is exclusive and - * this is equal to the upper term, break out */ - if ((compare < 0) || - (!includeUpper && compare==0)) { - break; + do { + Term term = enumerator.term(); + if (term != null && term.field().equals(fieldName)) { + if (!checkLower || null==lowerTerm || term.text().compareTo(lowerTerm) > 0) { + checkLower = false; + if (upperTerm != null) { + int compare = upperTerm.compareTo(term.text()); + /* if beyond the upper term, or is exclusive and + * this is equal to the upper term, break out */ + if ((compare < 0) || + (!includeUpper && compare==0)) { + break; + } } - } - /* we have a good term, find the docs */ + /* we have a good term, find the docs */ - termDocs.seek(enumerator.term()); - while (termDocs.next()) { - bits.set(termDocs.doc()); + termDocs.seek(enumerator.term()); + while (termDocs.next()) { + bits.set(termDocs.doc()); + } } + } else { + break; } - } else { - break; } + while (enumerator.next()); + + } finally { + termDocs.close(); } - while 
(enumerator.next()); - - } finally { - termDocs.close(); } } finally { enumerator.close(); @@ -162,7 +217,7 @@ OpenBitSet bits = new OpenBitSet(reader.maxDoc()); TermEnum enumerator = - (null != lowerTerm + (null != lowerTerm && collator == null ? reader.terms(new Term(fieldName, lowerTerm)) : reader.terms(new Term(fieldName,""))); @@ -171,43 +226,72 @@ if (enumerator.term() == null) { return bits; } - - boolean checkLower = false; - if (!includeLower) // make adjustments to set to exclusive - checkLower = true; + + TermDocs termDocs = reader.termDocs(); + + if (collator != null) { + try { + do { + Term term = enumerator.term(); + if (term != null && term.field().equals(fieldName)) { + if ((lowerTerm == null + || (includeLower + ? collator.compare(term.text(), lowerTerm) >= 0 + : collator.compare(term.text(), lowerTerm) > 0)) + && (upperTerm == null + || (includeUpper + ? collator.compare(term.text(), upperTerm) <= 0 + : collator.compare(term.text(), upperTerm) < 0))) { + /* we have a good term, find the docs */ + termDocs.seek(enumerator.term()); + while (termDocs.next()) { + bits.set(termDocs.doc()); + } + } + } + } + while (enumerator.next()); + } + finally { + termDocs.close(); + } + } else { // collator is null - use Unicode code point ordering + boolean checkLower = false; + if (!includeLower) // make adjustments to set to exclusive + checkLower = true; - TermDocs termDocs = reader.termDocs(); - try { + try { - do { - Term term = enumerator.term(); - if (term != null && term.field().equals(fieldName)) { - if (!checkLower || null==lowerTerm || term.text().compareTo(lowerTerm) > 0) { - checkLower = false; - if (upperTerm != null) { - int compare = upperTerm.compareTo(term.text()); - /* if beyond the upper term, or is exclusive and - * this is equal to the upper term, break out */ - if ((compare < 0) || - (!includeUpper && compare==0)) { - break; + do { + Term term = enumerator.term(); + if (term != null && term.field().equals(fieldName)) { + if (!checkLower || 
null==lowerTerm || term.text().compareTo(lowerTerm) > 0) { + checkLower = false; + if (upperTerm != null) { + int compare = upperTerm.compareTo(term.text()); + /* if beyond the upper term, or is exclusive and + * this is equal to the upper term, break out */ + if ((compare < 0) || + (!includeUpper && compare==0)) { + break; + } } - } - /* we have a good term, find the docs */ + /* we have a good term, find the docs */ - termDocs.seek(enumerator.term()); - while (termDocs.next()) { - bits.set(termDocs.doc()); + termDocs.seek(enumerator.term()); + while (termDocs.next()) { + bits.set(termDocs.doc()); + } } + } else { + break; } - } else { - break; } + while (enumerator.next()); + + } finally { + termDocs.close(); } - while (enumerator.next()); - - } finally { - termDocs.close(); } } finally { enumerator.close(); @@ -241,6 +325,7 @@ if (!this.fieldName.equals(other.fieldName) || this.includeLower != other.includeLower || this.includeUpper != other.includeUpper + || (this.collator != null ? ! this.collator.equals(other.collator) : other.collator != null) /* null-safe, symmetric */ ) { return false; } if (this.lowerTerm != null ? !this.lowerTerm.equals(other.lowerTerm) : other.lowerTerm != null) return false; if (this.upperTerm != null ? !this.upperTerm.equals(other.upperTerm) : other.upperTerm != null) return false; @@ -255,6 +340,7 @@ h ^= (upperTerm != null ? (upperTerm.hashCode()) : 0x91BEC2C2); h ^= (includeLower ? 0xD484B933 : 0) ^ (includeUpper ? 0x6AE423AC : 0); + h ^= collator != null ? 
collator.hashCode() : 0; return h; } } Index: src/java/org/apache/lucene/search/RangeQuery.java =================================================================== --- src/java/org/apache/lucene/search/RangeQuery.java (revision 653997) +++ src/java/org/apache/lucene/search/RangeQuery.java (working copy) @@ -18,6 +18,7 @@ */ import java.io.IOException; +import java.text.Collator; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; @@ -46,12 +47,18 @@ private Term lowerTerm; private Term upperTerm; private boolean inclusive; + private Collator collator; /** Constructs a query selecting all terms greater than * lowerTerm but less than upperTerm. * There must be at least one term and either term may be null, * in which case there is no bound on that side, but if there are * two terms, both terms must be for the same field. + * + * @param lowerTerm The Term at the lower end of the range + * @param upperTerm The Term at the upper end of the range + * @param inclusive If true, both lowerTerm and + * upperTerm will themselves be included in the range. */ public RangeQuery(Term lowerTerm, Term upperTerm, boolean inclusive) { @@ -76,48 +83,109 @@ this.inclusive = inclusive; } + /** Constructs a query selecting all terms greater than + * lowerTerm but less than upperTerm. + * There must be at least one term and either term may be null, + * in which case there is no bound on that side, but if there are + * two terms, both terms must be for the same field. + *

+ * If collator is not null, it will be used to decide whether + * index terms are within the given range, rather than using the Unicode code + * point order in which index terms are stored. + *

+ * WARNING: Using this constructor and supplying a non-null + * value in the collator parameter will cause every single + * index Term in the Field referenced by lowerTerm and/or upperTerm to be + * examined. Depending on the number of index Terms in this Field, the + * operation could be very slow. + * + * @param lowerTerm The Term at the lower end of the range + * @param upperTerm The Term at the upper end of the range + * @param inclusive If true, both lowerTerm and + * upperTerm will themselves be included in the range. + * @param collator The collator to use to collate index Terms, to determine + * their membership in the range bounded by lowerTerm and + * upperTerm. + */ + public RangeQuery(Term lowerTerm, Term upperTerm, boolean inclusive, + Collator collator) + { + this(lowerTerm, upperTerm, inclusive); + this.collator = collator; + } + public Query rewrite(IndexReader reader) throws IOException { BooleanQuery query = new BooleanQuery(true); - TermEnum enumerator = reader.terms(lowerTerm); + String testField = getField(); + if (collator != null) { + TermEnum enumerator = reader.terms(new Term(testField, "")); + String lowerTermText = lowerTerm != null ? lowerTerm.text() : null; + String upperTermText = upperTerm != null ? upperTerm.text() : null; - try { + try { + do { + Term term = enumerator.term(); + if (term != null && term.field() == testField) { // interned comparison + if ((lowerTermText == null + || (inclusive ? collator.compare(term.text(), lowerTermText) >= 0 + : collator.compare(term.text(), lowerTermText) > 0)) + && (upperTermText == null + || (inclusive ? 
collator.compare(term.text(), upperTermText) <= 0 + : collator.compare(term.text(), upperTermText) < 0))) { + addTermToQuery(term, query); + } + } + } + while (enumerator.next()); + } + finally { + enumerator.close(); + } + } + else { // collator is null + TermEnum enumerator = reader.terms(lowerTerm); - boolean checkLower = false; - if (!inclusive) // make adjustments to set to exclusive - checkLower = true; + try { - String testField = getField(); + boolean checkLower = false; + if (!inclusive) // make adjustments to set to exclusive + checkLower = true; - do { - Term term = enumerator.term(); - if (term != null && term.field() == testField) { // interned comparison - if (!checkLower || term.text().compareTo(lowerTerm.text()) > 0) { - checkLower = false; - if (upperTerm != null) { - int compare = upperTerm.text().compareTo(term.text()); - /* if beyond the upper term, or is exclusive and - * this is equal to the upper term, break out */ - if ((compare < 0) || (!inclusive && compare == 0)) - break; + do { + Term term = enumerator.term(); + if (term != null && term.field() == testField) { // interned comparison + if (!checkLower || term.text().compareTo(lowerTerm.text()) > 0) { + checkLower = false; + if (upperTerm != null) { + int compare = upperTerm.text().compareTo(term.text()); + /* if beyond the upper term, or is exclusive and + * this is equal to the upper term, break out */ + if ((compare < 0) || (!inclusive && compare == 0)) + break; + } + addTermToQuery(term, query); // Found a match } - TermQuery tq = new TermQuery(term); // found a match - tq.setBoost(getBoost()); // set the boost - query.add(tq, BooleanClause.Occur.SHOULD); // add to query } + else { + break; + } } - else { - break; - } + while (enumerator.next()); } - while (enumerator.next()); + finally { + enumerator.close(); + } } - finally { - enumerator.close(); - } return query; } + private void addTermToQuery(Term term, BooleanQuery query) { + TermQuery tq = new TermQuery(term); + 
tq.setBoost(getBoost()); // set the boost + query.add(tq, BooleanClause.Occur.SHOULD); // add to query + } + /** Returns the field name for this query */ public String getField() { return (lowerTerm != null ? lowerTerm.field() : upperTerm.field()); @@ -132,7 +200,10 @@ /** Returns true if the range query is inclusive */ public boolean isInclusive() { return inclusive; } + /** Returns the collator used to determine range inclusion, if any. */ + public Collator getCollator() { return collator; } + /** Prints a user-readable version of this query. */ public String toString(String field) { @@ -159,6 +230,9 @@ final RangeQuery other = (RangeQuery) o; if (this.getBoost() != other.getBoost()) return false; if (this.inclusive != other.inclusive) return false; + if (this.collator != null ? ! this.collator.equals(other.collator) : other.collator != null) /* null-safe, symmetric */ + return false; + // one of lowerTerm and upperTerm can be null if (this.lowerTerm != null ? !this.lowerTerm.equals(other.lowerTerm) : other.lowerTerm != null) return false; if (this.upperTerm != null ? !this.upperTerm.equals(other.upperTerm) : other.upperTerm != null) return false; @@ -174,6 +248,7 @@ h ^= (h << 25) | (h >>> 8); h ^= upperTerm != null ? upperTerm.hashCode() : 0; h ^= this.inclusive ? 0x2742E74A : 0; + h ^= collator != null ? 
collator.hashCode() : 0; return h; } } Index: src/java/org/apache/lucene/search/ConstantScoreRangeQuery.java =================================================================== --- src/java/org/apache/lucene/search/ConstantScoreRangeQuery.java (revision 653997) +++ src/java/org/apache/lucene/search/ConstantScoreRangeQuery.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.index.IndexReader; import java.io.IOException; +import java.text.Collator; /** * A range query that returns a constant score equal to its boost for @@ -42,6 +43,7 @@ private final String upperVal; private final boolean includeLower; private final boolean includeUpper; + private Collator collator; public ConstantScoreRangeQuery(String fieldName, String lowerVal, String upperVal, boolean includeLower, boolean includeUpper) @@ -65,6 +67,14 @@ this.includeUpper = includeUpper; } + public ConstantScoreRangeQuery(String fieldName, String lowerVal, + String upperVal, boolean includeLower, + boolean includeUpper, Collator collator) + { + this(fieldName, lowerVal, upperVal, includeLower, includeUpper); + this.collator = collator; + } + /** Returns the field name for this query */ public String getField() { return fieldName; } /** Returns the value of the lower endpoint of this range query, null if open ended */ @@ -78,9 +88,10 @@ public Query rewrite(IndexReader reader) throws IOException { // Map to RangeFilter semantics which are slightly different... 
- RangeFilter rangeFilt = new RangeFilter(fieldName, - lowerVal!=null?lowerVal:"", - upperVal, lowerVal==""?false:includeLower, upperVal==null?false:includeUpper); + RangeFilter rangeFilt = new RangeFilter + (fieldName, lowerVal != null?lowerVal:"", upperVal, + lowerVal==""?false:includeLower, upperVal==null?false:includeUpper, + collator); Query q = new ConstantScoreQuery(rangeFilt); q.setBoost(getBoost()); return q; @@ -117,6 +128,7 @@ if (this.fieldName != other.fieldName // interned comparison || this.includeLower != other.includeLower || this.includeUpper != other.includeUpper + || (this.collator != null ? ! this.collator.equals(other.collator) : other.collator != null) /* null-safe, symmetric */ ) { return false; } if (this.lowerVal != null ? !this.lowerVal.equals(other.lowerVal) : other.lowerVal != null) return false; if (this.upperVal != null ? !this.upperVal.equals(other.upperVal) : other.upperVal != null) return false; @@ -134,6 +146,7 @@ h ^= (upperVal != null ? (upperVal.hashCode()) : 0x5a695a69); h ^= (includeLower ? 0x665599aa : 0) ^ (includeUpper ? 0x99aa5566 : 0); + h ^= collator != null ? collator.hashCode() : 0; return h; } }