Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1164028) +++ lucene/CHANGES.txt (working copy) @@ -57,6 +57,9 @@ * LUCENE-3409: IndexWriter.deleteAll was failing to close pooled NRT SegmentReaders, leading to unused files accumulating in the Directory. (tal steier via Mike McCandless) + +* LUCENE-3390: Sort by Numeric values was incorrect when some documents were missing + the sorting field. (Gilad Barkai via Doron Cohen) New Features Index: lucene/src/test/org/apache/lucene/search/TestSort.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSort.java (revision 1164028) +++ lucene/src/test/org/apache/lucene/search/TestSort.java (working copy) @@ -68,6 +68,7 @@ private Query queryE; private Query queryF; private Query queryG; + private Query queryM; private Sort sort; @BeforeClass @@ -82,23 +83,29 @@ // the string field to sort by string // the i18n field includes accented characters for testing locale-specific sorting private String[][] data = new String[][] { - // tracer contents int float string custom i18n long double, 'short', byte, 'custom parser encoding' - { "A", "x a", "5", "4f", "c", "A-3", "p\u00EAche", "10", "-4.0", "3", "126", "J"},//A, x - { "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT", "1000000000", "40.0", "24", "1", "I"},//B, y - { "C", "x a b c", "2147483647", "1.0", "j", "A-2", "p\u00E9ch\u00E9", "99999999", "40.00002343", "125", "15", "H"},//C, x - { "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT", String.valueOf(Long.MAX_VALUE), String.valueOf(Double.MIN_VALUE), String.valueOf(Short.MIN_VALUE), String.valueOf(Byte.MIN_VALUE), "G"},//D, y - { "E", "x a b c d", "5", "2f", "h", "B-8", "peach", String.valueOf(Long.MIN_VALUE), String.valueOf(Double.MAX_VALUE), String.valueOf(Short.MAX_VALUE), String.valueOf(Byte.MAX_VALUE), "F"},//E,x - { "F", "y a b c d", "2", "3.14159f", "g", "B-1", "H\u00C5T", "-44", "343.034435444", "-3", "0", "E"},//F,y - { "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin", "323254543543", "4.043544", "5", "100", "D"},//G,x - { "H", "y a b c d", "0", "1.4E-45", "e", "C-88", "H\u00D8T", "1023423423005","4.043545", "10", "-50", "C"},//H,y - { "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", "s\u00EDn", "332422459999", "4.043546", "-340", "51", "B"},//I,x - { "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT", "34334543543", "4.0000220343", "300", "2", "A"},//J,y + // tracer contents int float string custom i18n long double, short, byte, 'custom parser encoding' + { "A", "x a", "5", "4f", "c", "A-3", "p\u00EAche", "10", "-4.0", "3", "126", "J"},//A, x + { "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT", "1000000000", "40.0", "24", "1", "I"},//B, y + { "C", "x a b c", "2147483647", "1.0", "j", "A-2", "p\u00E9ch\u00E9", "99999999","40.00002343", "125", "15", "H"},//C, x + { "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT", String.valueOf(Long.MAX_VALUE),String.valueOf(Double.MIN_VALUE), String.valueOf(Short.MIN_VALUE), String.valueOf(Byte.MIN_VALUE), "G"},//D, y + { "E", "x a b c d", "5", "2f", "h", "B-8", "peach", String.valueOf(Long.MIN_VALUE),String.valueOf(Double.MAX_VALUE), String.valueOf(Short.MAX_VALUE), String.valueOf(Byte.MAX_VALUE), "F"},//E,x + { "F", "y a b c d", "2", "3.14159f", "g", "B-1", "H\u00C5T", "-44", "343.034435444", "-3", "0", "E"},//F,y + { "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin", "323254543543", "4.043544", "5", "100", "D"},//G,x + { "H", "y a b c d", "0", "1.4E-45", "e", "C-88", "H\u00D8T", "1023423423005","4.043545", "10", "-50", "C"},//H,y + { "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", "s\u00EDn", "332422459999", "4.043546", "-340", "51", "B"},//I,x + { "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT", "34334543543", "4.0000220343", "300", "2", "A"},//J,y { "W", "g", "1", null, null, null, null, null, null, null, null, null}, { "X", "g", "1", "0.1", null, null, null, null, null, null, null, null}, { "Y", "g", "1", "0.2", null, null, null, null, null, null, null, null}, - { "Z", "f g", null, null, null, null, null, null, null, null, null, null} - }; + { "Z", "f g", null, null, null, null, null, null, null, null, null, null}, + // Sort Missing first/last + { "a", "m", null, null, null, null, null, null, null, null, null, null}, + { "b", "m", "4", "4.0", "4", null, null, "4", "4", "4", "4", null}, + { "c", "m", "5", "5.0", "5", null, null, "5", "5", "5", "5", null}, + { "d", "m", null, null, null, null, null, null, null, null, null, null} + }; + // the sort order of Ø versus U depends on the version of the rules being used // for the inherited root locale: Ø's order isnt specified in Locale.US since // its not used in english. @@ -224,6 +231,7 @@ queryE = new TermQuery (new Term ("contents", "e")); queryF = new TermQuery (new Term ("contents", "f")); queryG = new TermQuery (new Term ("contents", "g")); + queryM = new TermQuery (new Term ("contents", "m")); sort = new Sort(); } @@ -284,6 +292,68 @@ assertMatches (full, queryY, sort, "DJHFB"); } + private static class SortMissingLastTestHelper { + final SortField sortField; + final Object min; + final Object max; + + SortMissingLastTestHelper( SortField sortField, Object min, Object max ) { + this.sortField = sortField; + this.min = min; + this.max = max; + } + } + + // test sorts where the type of field is specified + public void testSortMissingLast() throws Exception { + + @SuppressWarnings("boxing") + SortMissingLastTestHelper[] ascendTesters = new SortMissingLastTestHelper[] { + new SortMissingLastTestHelper( new SortField( "byte", SortField.BYTE ), Byte.MIN_VALUE, Byte.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "short", SortField.SHORT ), Short.MIN_VALUE, Short.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "int", SortField.INT ), Integer.MIN_VALUE, Integer.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "long", SortField.LONG ), Long.MIN_VALUE, Long.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "float", SortField.FLOAT ), Float.MIN_VALUE, Float.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "double", SortField.DOUBLE ), Double.MIN_VALUE, Double.MAX_VALUE ), + }; + + @SuppressWarnings("boxing") + SortMissingLastTestHelper[] descendTesters = new SortMissingLastTestHelper[] { + new SortMissingLastTestHelper( new SortField( "byte", SortField.BYTE, true ), Byte.MIN_VALUE, Byte.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "short", SortField.SHORT, true ), Short.MIN_VALUE, Short.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "int", SortField.INT, true ), Integer.MIN_VALUE, Integer.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "long", SortField.LONG, true ), Long.MIN_VALUE, Long.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "float", SortField.FLOAT, true ), Float.MIN_VALUE, Float.MAX_VALUE ), + new SortMissingLastTestHelper( new SortField( "double", SortField.DOUBLE, true ), Double.MIN_VALUE, Double.MAX_VALUE ), + }; + + // Default order: ascending + for( SortMissingLastTestHelper t : ascendTesters ) { + sort.setSort (t.sortField, SortField.FIELD_DOC ); + assertMatches("sortField:"+t.sortField, full, queryM, sort, "adbc" ); + + sort.setSort (t.sortField.setMissingValue( t.max ), SortField.FIELD_DOC ); + assertMatches("sortField:"+t.sortField, full, queryM, sort, "bcad" ); + + sort.setSort (t.sortField.setMissingValue( t.min ), SortField.FIELD_DOC ); + assertMatches("sortField:"+t.sortField, full, queryM, sort, "adbc" ); + } + + // Reverse order: descending (Note: Order for un-valued documents remains the same due to tie breaker: a,d) + for( SortMissingLastTestHelper t : descendTesters ) { + sort.setSort (t.sortField, SortField.FIELD_DOC ); + assertMatches("sortField:"+t.sortField, full, queryM, sort, "cbad" ); + + sort.setSort (t.sortField.setMissingValue( t.max ), SortField.FIELD_DOC ); + assertMatches("sortField:"+t.sortField, full, queryM, sort, "adcb" ); + + sort.setSort (t.sortField.setMissingValue( t.min ), SortField.FIELD_DOC ); + assertMatches("sortField:"+t.sortField, full, queryM, sort, "cbad" ); + } + + + } + /** * Test String sorting: small queue to many matches, multi field sort, reverse sort */ @@ -1056,13 +1126,17 @@ } + private void assertMatches(Searcher searcher, Query query, Sort sort, String expectedResult) throws IOException { + assertMatches( null, searcher, query, sort, expectedResult ); + } + // make sure the documents returned by the search match the expected list - private void assertMatches(Searcher searcher, Query query, Sort sort, + private void assertMatches(String msg, Searcher searcher, Query query, Sort sort, String expectedResult) throws IOException { //ScoreDoc[] result = searcher.search (query, null, 1000, sort).scoreDocs; TopDocs hits = searcher.search (query, null, Math.max(1, expectedResult.length()), sort); ScoreDoc[] result = hits.scoreDocs; - assertEquals(hits.totalHits, expectedResult.length()); + assertEquals(expectedResult.length(),hits.totalHits); StringBuilder buff = new StringBuilder(10); int n = result.length; for (int i=0; i getScores (ScoreDoc[] hits, Searcher searcher) Index: lucene/src/java/org/apache/lucene/search/FieldCache.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldCache.java (revision 1164028) +++ lucene/src/java/org/apache/lucene/search/FieldCache.java (working copy) @@ -309,6 +309,14 @@ return FieldCache.class.getName()+".NUMERIC_UTILS_DOUBLE_PARSER"; } }; + + /** Checks the internal cache for an appropriate entry, and if none is found, + * reads the terms in field and returns a bit set at the size of + * reader.maxDoc(), with turned on bits for each docid that + * does not have a value for this field. + */ + public DocIdSet getUnValuedDocs (IndexReader reader, String field) + throws IOException; /** Checks the internal cache for an appropriate entry, and if none is * found, reads the terms in field as a single byte and returns an array @@ -595,5 +603,5 @@ public void setInfoStream(PrintStream stream); /** counterpart of {@link #setInfoStream(PrintStream)} */ - public PrintStream getInfoStream(); + public PrintStream getInfoStream(); } Index: lucene/src/java/org/apache/lucene/search/FieldComparator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldComparator.java (revision 1164028) +++ lucene/src/java/org/apache/lucene/search/FieldComparator.java (working copy) @@ -29,6 +29,7 @@ import org.apache.lucene.search.FieldCache.IntParser; import org.apache.lucene.search.FieldCache.ShortParser; import org.apache.lucene.search.FieldCache.StringIndex; +import org.apache.lucene.util.OpenBitSet; /** * Expert: a FieldComparator compares hits so as to determine their @@ -83,6 +84,14 @@ */ public abstract class FieldComparator { + protected T missingValue = null; + + /** Set a default sorting value for documents which lacks one */ + public FieldComparator setMissingValue(T missingValue) { + this.missingValue = missingValue; + return this; + } + /** * Compare hit at slot1 with hit at slot2. * @@ -205,6 +214,13 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getBytes(reader, field, parser); + if (missingValue != null) { + DocIdSetIterator iterator = FieldCache.DEFAULT.getUnValuedDocs(reader, field).iterator(); + final byte byteValue = missingValue.byteValue(); + for (int doc=iterator.nextDoc(); doc!=DocIdSetIterator.NO_MORE_DOCS; doc=iterator.nextDoc()) { + currentReaderValues[doc] = byteValue; + } + } } @Override @@ -312,6 +328,13 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getDoubles(reader, field, parser); + if (missingValue != null) { + DocIdSetIterator iterator = FieldCache.DEFAULT.getUnValuedDocs(reader, field).iterator(); + final double doubleValue = missingValue.doubleValue(); + for (int doc=iterator.nextDoc(); doc!=DocIdSetIterator.NO_MORE_DOCS; doc=iterator.nextDoc()) { + currentReaderValues[doc] = doubleValue; + } + } } @Override @@ -377,6 +400,13 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getFloats(reader, field, parser); + if (missingValue != null) { + DocIdSetIterator iterator = FieldCache.DEFAULT.getUnValuedDocs(reader, field).iterator(); + final float floatValue = missingValue.floatValue(); + for (int doc=iterator.nextDoc(); doc!=DocIdSetIterator.NO_MORE_DOCS; doc=iterator.nextDoc()) { + currentReaderValues[doc] = floatValue; + } + } } @Override @@ -446,6 +476,13 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getInts(reader, field, parser); + if (missingValue != null) { + DocIdSetIterator iterator = FieldCache.DEFAULT.getUnValuedDocs(reader, field).iterator(); + final int intValue = missingValue.intValue(); + for (int doc=iterator.nextDoc(); doc!=DocIdSetIterator.NO_MORE_DOCS; doc=iterator.nextDoc()) { + currentReaderValues[doc] = intValue; + } + } } @Override @@ -511,6 +548,13 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getLongs(reader, field, parser); + if (missingValue != null) { + DocIdSetIterator iterator = FieldCache.DEFAULT.getUnValuedDocs(reader, field).iterator(); + final long longValue = missingValue.longValue(); + for (int doc=iterator.nextDoc(); doc!=DocIdSetIterator.NO_MORE_DOCS; doc=iterator.nextDoc()) { + currentReaderValues[doc] = longValue; + } + } } @Override @@ -625,6 +669,13 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getShorts(reader, field, parser); + if (missingValue != null) { + DocIdSetIterator iterator = FieldCache.DEFAULT.getUnValuedDocs(reader, field).iterator(); + final short shortValue = missingValue.shortValue(); + for (int doc=iterator.nextDoc(); doc!=DocIdSetIterator.NO_MORE_DOCS; doc=iterator.nextDoc()) { + currentReaderValues[doc] = shortValue; + } + } } @Override Index: lucene/src/java/org/apache/lucene/search/SortField.java =================================================================== --- lucene/src/java/org/apache/lucene/search/SortField.java (revision 1164028) +++ lucene/src/java/org/apache/lucene/search/SortField.java (working copy) @@ -102,6 +102,8 @@ // Used for CUSTOM sort private FieldComparatorSource comparatorSource; + private Object missingValue; + /** Creates a sort by terms in the given field with the type of term * values explicitly given. * @param field Name of field to sort by. Can be null if @@ -204,6 +206,16 @@ this.comparatorSource = comparator; } + /** Set a default sorting value for documents which lacks one */ + public SortField setMissingValue(Object missingValue) { + if (type != BYTE && type != SHORT && type != INT && type != FLOAT && type != LONG && type != DOUBLE) { + throw new IllegalArgumentException( "Missing value only works for numeric types" ); + } + this.missingValue = missingValue; + + return this; + } + // Sets field & type, and ensures field is not NULL unless // type is SCORE or DOC private void initFieldType(String field, int type) { @@ -391,22 +403,22 @@ return new FieldComparator.DocComparator(numHits); case SortField.INT: - return new FieldComparator.IntComparator(numHits, field, parser); + return new FieldComparator.IntComparator(numHits, field, parser).setMissingValue((Integer) missingValue); case SortField.FLOAT: - return new FieldComparator.FloatComparator(numHits, field, parser); + return new FieldComparator.FloatComparator(numHits, field, parser).setMissingValue((Float) missingValue); case SortField.LONG: - return new FieldComparator.LongComparator(numHits, field, parser); + return new FieldComparator.LongComparator(numHits, field, parser).setMissingValue((Long) missingValue); case SortField.DOUBLE: - return new FieldComparator.DoubleComparator(numHits, field, parser); + return new FieldComparator.DoubleComparator(numHits, field, parser).setMissingValue((Double) missingValue); case SortField.BYTE: - return new FieldComparator.ByteComparator(numHits, field, parser); + return new FieldComparator.ByteComparator(numHits, field, parser).setMissingValue((Byte) missingValue); case SortField.SHORT: - return new FieldComparator.ShortComparator(numHits, field, parser); + return new FieldComparator.ShortComparator(numHits, field, parser).setMissingValue((Short) missingValue); case SortField.CUSTOM: assert comparatorSource != null; Index: lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (revision 1164028) +++ lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (working copy) @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -29,6 +30,9 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.BitVector; +import org.apache.lucene.util.DocIdBitSet; +import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.FieldCacheSanityChecker; @@ -47,7 +51,7 @@ init(); } private synchronized void init() { - caches = new HashMap,Cache>(7); + caches = new HashMap,Cache>(9); caches.put(Byte.TYPE, new ByteCache(this)); caches.put(Short.TYPE, new ShortCache(this)); caches.put(Integer.TYPE, new IntCache(this)); @@ -56,6 +60,7 @@ caches.put(Double.TYPE, new DoubleCache(this)); caches.put(String.class, new StringCache(this)); caches.put(StringIndex.class, new StringIndexCache(this)); + caches.put(UnValuedDocsCache.class, new UnValuedDocsCache(this)); } public synchronized void purgeAllCaches() { @@ -409,6 +414,42 @@ } } + static final class UnValuedDocsCache extends Cache { + UnValuedDocsCache(FieldCache wrapper) { + super(wrapper); + } + + @Override + protected Object createValue(IndexReader reader, Entry entryKey) + throws IOException { + Entry entry = entryKey; + String field = entry.field; + + if (reader.maxDoc() == reader.docFreq(new Term(field))) { + return DocIdSet.EMPTY_DOCIDSET; + } + + OpenBitSet res = new OpenBitSet(reader.maxDoc()); + TermDocs termDocs = reader.termDocs(); + TermEnum termEnum = reader.terms (new Term (field)); + try { + do { + Term term = termEnum.term(); + if (term==null || term.field() != field) break; + termDocs.seek (termEnum); + while (termDocs.next()) { + res.fastSet(termDocs.doc()); + } + } while (termEnum.next()); + } finally { + termDocs.close(); + termEnum.close(); + } + res.flip(0, reader.maxDoc()); + return res; + } + } + // inherit javadocs public float[] getFloats (IndexReader reader, String field) @@ -471,7 +512,7 @@ public long[] getLongs(IndexReader reader, String field) throws IOException { return getLongs(reader, field, null); } - + // inherit javadocs public long[] getLongs(IndexReader reader, String field, FieldCache.LongParser parser) throws IOException { @@ -695,5 +736,10 @@ public PrintStream getInfoStream() { return infoStream; } + + public DocIdSet getUnValuedDocs(IndexReader reader, String field) + throws IOException { + return (DocIdSet) caches.get(UnValuedDocsCache.class).get(reader, new Entry(field, null)); + } }