Index: lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java =================================================================== --- lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java (revision 1558020) +++ lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java (working copy) @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Random; @@ -29,6 +30,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.StoredField; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; @@ -60,30 +62,44 @@ final List docValues = new ArrayList(); // TODO: deletions while (numDocs < NUM_DOCS) { - final String s; - if (random.nextBoolean()) { - s = _TestUtil.randomSimpleString(random, maxLength); - } else { - s = _TestUtil.randomUnicodeString(random, maxLength); - } - final BytesRef br = new BytesRef(s); + final Document doc = new Document(); - if (!allowDups) { - if (seen.contains(s)) { - continue; + // 10% of the time, the document is missing the value: + final BytesRef br; + if (random().nextInt(10) != 7) { + final String s; + if (random.nextBoolean()) { + s = _TestUtil.randomSimpleString(random, maxLength); + } else { + s = _TestUtil.randomUnicodeString(random, maxLength); } - seen.add(s); + + if (!allowDups) { + if (seen.contains(s)) { + continue; + } + seen.add(s); + } + + if (VERBOSE) { + System.out.println(" " + numDocs + ": s=" + s); + } + + br = new BytesRef(s); + doc.add(new SortedDocValuesField("stringdv", br)); + doc.add(newStringField("string", s, Field.Store.NO)); + docValues.add(br); + + } else { + br = null; + if (VERBOSE) { + System.out.println(" " + numDocs + ": "); + } + docValues.add(null); } - if (VERBOSE) { - System.out.println(" " + numDocs + ": s=" + s); - } - - final Document doc = new Document(); - doc.add(new SortedDocValuesField("stringdv", br)); - doc.add(newStringField("string", s, Field.Store.NO)); doc.add(new NumericDocValuesField("id", numDocs)); - docValues.add(br); + doc.add(new StoredField("id", numDocs)); writer.addDocument(doc); numDocs++; @@ -103,13 +119,26 @@ final int ITERS = atLeast(100); for(int iter=0;iter" : br.utf8ToString())); if (idx == hitCount-1) { break; } @@ -161,12 +217,30 @@ System.out.println(" actual:"); for(int hitIDX=0;hitIDX" : br.utf8ToString()) + " id=" + s.doc(fd.doc).get("id")); } } for(int hitIDX=0;hitIDX * WARNING: This is not a valid UTF8 Term **/ Index: lucene/core/src/java/org/apache/lucene/search/SortField.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/SortField.java (revision 1558020) +++ lucene/core/src/java/org/apache/lucene/search/SortField.java (working copy) @@ -106,6 +106,9 @@ // Used for 'sortMissingFirst/Last' public Object missingValue = null; + // Only used with type=STRING + public boolean sortMissingLast; + /** Creates a sort by terms in the given field with the type of term * values explicitly given. * @param field Name of field to sort by. Can be null if @@ -165,13 +168,24 @@ this.reverse = reverse; this.parser = parser; } + + /** Pass this to {@link #setMissingValue} to have missing + * string values sort first. */ + public final static Object STRING_FIRST = new Object(); - public SortField setMissingValue(Object missingValue) { - if (type != Type.INT && type != Type.FLOAT && type != Type.LONG && type != Type.DOUBLE) { - throw new IllegalArgumentException( "Missing value only works for numeric types" ); + /** Pass this to {@link #setMissingValue} to have missing + * string values sort last. */ + public final static Object STRING_LAST = new Object(); + + public void setMissingValue(Object missingValue) { + if (type == Type.STRING) { + if (missingValue != STRING_FIRST && missingValue != STRING_LAST) { + throw new IllegalArgumentException("For STRING type, missing value must be either STRING_FIRST or STRING_LAST"); + } + } else if (type != Type.INT && type != Type.FLOAT && type != Type.LONG && type != Type.DOUBLE) { + throw new IllegalArgumentException("Missing value only works for numeric or STRING types"); } this.missingValue = missingValue; - return this; } /** Creates a sort with a custom comparison function. @@ -376,9 +390,10 @@ return comparatorSource.newComparator(field, numHits, sortPos, reverse); case STRING: - return new FieldComparator.TermOrdValComparator(numHits, field); + return new FieldComparator.TermOrdValComparator(numHits, field, missingValue == STRING_LAST); case STRING_VAL: + // TODO: should we remove this? who really uses it? return new FieldComparator.TermValComparator(numHits, field); case REWRITEABLE: Index: lucene/core/src/java/org/apache/lucene/search/FieldComparator.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/FieldComparator.java (revision 1558020) +++ lucene/core/src/java/org/apache/lucene/search/FieldComparator.java (working copy) @@ -702,11 +702,33 @@ final BytesRef tempBR = new BytesRef(); + /** -1 if missing values are sorted first, 1 if they are + * sorted last */ + final int missingSortCmp; + + /** Which ordinal to use for a missing value. */ + final int missingOrd; + + /** Creates this, sorting missing values first. */ public TermOrdValComparator(int numHits, String field) { + this(numHits, field, false); + } + + /** Creates this, with control over how missing values + * are sorted. Pass sortMissingLast=true to put + * missing values at the end. */ + public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) { ords = new int[numHits]; values = new BytesRef[numHits]; readerGen = new int[numHits]; this.field = field; + if (sortMissingLast) { + missingSortCmp = 1; + missingOrd = Integer.MAX_VALUE; + } else { + missingSortCmp = -1; + missingOrd = -1; + } } @Override @@ -721,9 +743,9 @@ if (val2 == null) { return 0; } - return -1; + return missingSortCmp; } else if (val2 == null) { - return 1; + return -missingSortCmp; } return val1.compareTo(val2); } @@ -737,20 +759,48 @@ public void copy(int slot, int doc) { throw new UnsupportedOperationException(); } + + // this stuff caches the ordinal of the last 'value' from compareDocToValue + BytesRef lastValue = new BytesRef(); // sentinel + int lastOrd; + int lastReaderGen = -1; + boolean lastSameReader = false; @Override public int compareDocToValue(int doc, BytesRef value) { - int ord = termsIndex.getOrd(doc); - if (ord == -1) { + if (value != lastValue || currentReaderGen != lastReaderGen) { + // cache miss (typically once per segment) + // nocommit: move this out to another method? + lastReaderGen = currentReaderGen; + lastValue = value; if (value == null) { - return 0; + // -1 ord is null + lastOrd = missingOrd; + lastSameReader = true; + } else { + final int index = termsIndex.lookupTerm(value); + if (index < 0) { + lastOrd = -index - 2; + lastSameReader = false; + } else { + lastOrd = index; + lastSameReader = true; + } } + } + + int ord = termsIndex.getOrd(doc); + + if (lastSameReader) { + // ord is precisely comparable, even in the equal case + return ord - lastOrd; + } else if (ord <= lastOrd) { + // the equals case always means doc is < value + // (because we set lastOrd to the lower bound) return -1; - } else if (value == null) { + } else { return 1; } - termsIndex.lookupOrd(ord, tempBR); - return tempBR.compareTo(value); } /** Base class for specialized (per bit width of the @@ -787,9 +837,9 @@ if (val2 == null) { return 0; } - return -1; + return missingSortCmp; } else if (val2 == null) { - return 1; + return -missingSortCmp; } return val1.compareTo(val2); } @@ -813,7 +863,10 @@ @Override public int compareBottom(int doc) { assert bottomSlot != -1; - final int docOrd = termsIndex.getOrd(doc); + int docOrd = termsIndex.getOrd(doc); + if (docOrd == -1) { + docOrd = missingOrd; + } if (bottomSameReader) { // ord is precisely comparable, even in the equal case return bottomOrd - docOrd; @@ -829,9 +882,9 @@ @Override public void copy(int slot, int doc) { - final int ord = termsIndex.getOrd(doc); - ords[slot] = ord; + int ord = termsIndex.getOrd(doc); if (ord == -1) { + ord = missingOrd; values[slot] = null; } else { assert ord >= 0; @@ -840,6 +893,7 @@ } termsIndex.lookupOrd(ord, values[slot]); } + ords[slot] = ord; readerGen[slot] = currentReaderGen; } } @@ -867,9 +921,9 @@ bottomSameReader = true; } else { if (bottomValue == null) { - // -1 ord is null for all segments - assert ords[bottomSlot] == -1; - bottomOrd = -1; + // missingOrd is null for all segments + assert ords[bottomSlot] == missingOrd; + bottomOrd = missingOrd; bottomSameReader = true; readerGen[bottomSlot] = currentReaderGen; } else { @@ -894,15 +948,16 @@ } } - // just used internally in this comparator - private static final byte[] MISSING_BYTES = new byte[0]; - /** Sorts by field's natural Term sort order. All * comparisons are done using BytesRef.compareTo, which is * slow for medium to large result sets but possibly * very fast for very small results sets. */ + // TODO: should we remove this? who really uses it? public static final class TermValComparator extends FieldComparator { + // sentinel, just used internally in this comparator + private static final byte[] MISSING_BYTES = new byte[0]; + private BytesRef[] values; private BinaryDocValues docTerms; private Bits docsWithField; @@ -910,6 +965,9 @@ private BytesRef bottom; private final BytesRef tempBR = new BytesRef(); + // nocommit add missing last support here? + + /** Sole constructor. */ TermValComparator(int numHits, String field) { values = new BytesRef[numHits]; this.field = field; Index: solr/core/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java =================================================================== --- solr/core/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java (revision 1558020) +++ solr/core/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java (working copy) @@ -223,19 +223,48 @@ return values==null ? parent.NULL_VAL : values[slot]; } + // this stuff caches the ordinal of the last 'value' from compareDocToValue + BytesRef lastValue = new BytesRef(); // sentinel + int lastOrd; + int lastReaderGen = -1; + boolean lastSameReader = false; + @Override public int compareDocToValue(int doc, BytesRef value) { - int docOrd = termsIndex.getOrd(doc); - if (docOrd == -1) { + if (value != lastValue || currentReaderGen != lastReaderGen) { + // cache miss (typically once per segment) + // nocommit: move this out to another method? + lastReaderGen = currentReaderGen; + lastValue = value; if (value == null) { - return 0; + // -1 ord is null + lastOrd = NULL_ORD; + lastSameReader = true; + } else { + final int index = termsIndex.lookupTerm(value); + if (index < 0) { + lastOrd = -index - 2; + lastSameReader = false; + } else { + lastOrd = index; + lastSameReader = true; + } } + } + + int ord = termsIndex.getOrd(doc); + if (ord == -1) ord = NULL_ORD; + + if (lastSameReader) { + // ord is precisely comparable, even in the equal case + return ord - lastOrd; + } else if (ord <= lastOrd) { + // the equals case always means doc is < value + // (because we set lastOrd to the lower bound) + return -1; + } else { return 1; - } else if (value == null) { - return -1; } - termsIndex.lookupOrd(docOrd, tempBR); - return tempBR.compareTo(value); } } @@ -247,34 +276,18 @@ @Override public int compareBottom(int doc) { assert bottomSlot != -1; - int order = termsIndex.getOrd(doc); - if (order == -1) order = NULL_ORD; + int docOrd = termsIndex.getOrd(doc); + if (docOrd == -1) docOrd = NULL_ORD; if (bottomSameReader) { - // ord is precisely comparable, even in the equal - // case - return bottomOrd - order; + // ord is precisely comparable, even in the equal case + return bottomOrd - docOrd; + } else if (bottomOrd >= docOrd) { + // the equals case always means bottom is > doc + // (because we set bottomOrd to the lower bound in + // setBottom): + return 1; } else { - // ord is only approx comparable: if they are not - // equal, we can use that; if they are equal, we - // must fallback to compare by value - - final int cmp = bottomOrd - order; - if (cmp != 0) { - return cmp; - } - - // take care of the case where both vals are null - if (order == NULL_ORD) { - return 0; - } - - // and at this point we know that neither value is null, so safe to compare - if (order == NULL_ORD) { - return bottomValue.compareTo(parent.NULL_VAL); - } else { - termsIndex.lookupOrd(order, tempBR); - return bottomValue.compareTo(tempBR); - } + return -1; } } Index: solr/core/src/java/org/apache/solr/schema/TrieField.java =================================================================== --- solr/core/src/java/org/apache/solr/schema/TrieField.java (revision 1558020) +++ solr/core/src/java/org/apache/solr/schema/TrieField.java (working copy) @@ -142,7 +142,9 @@ Object missingValue = null; boolean sortMissingLast = field.sortMissingLast(); boolean sortMissingFirst = field.sortMissingFirst(); - + + SortField sf; + switch (type) { case INTEGER: if( sortMissingLast ) { @@ -151,7 +153,9 @@ else if( sortMissingFirst ) { missingValue = top ? Integer.MAX_VALUE : Integer.MIN_VALUE; } - return new SortField( field.getName(), FieldCache.NUMERIC_UTILS_INT_PARSER, top).setMissingValue(missingValue); + sf = new SortField( field.getName(), FieldCache.NUMERIC_UTILS_INT_PARSER, top); + sf.setMissingValue(missingValue); + return sf; case FLOAT: if( sortMissingLast ) { @@ -160,7 +164,9 @@ else if( sortMissingFirst ) { missingValue = top ? Float.POSITIVE_INFINITY : Float.NEGATIVE_INFINITY; } - return new SortField( field.getName(), FieldCache.NUMERIC_UTILS_FLOAT_PARSER, top).setMissingValue(missingValue); + sf = new SortField( field.getName(), FieldCache.NUMERIC_UTILS_FLOAT_PARSER, top); + sf.setMissingValue(missingValue); + return sf; case DATE: // fallthrough case LONG: @@ -170,7 +176,9 @@ else if( sortMissingFirst ) { missingValue = top ? Long.MAX_VALUE : Long.MIN_VALUE; } - return new SortField( field.getName(), FieldCache.NUMERIC_UTILS_LONG_PARSER, top).setMissingValue(missingValue); + sf = new SortField( field.getName(), FieldCache.NUMERIC_UTILS_LONG_PARSER, top); + sf.setMissingValue(missingValue); + return sf; case DOUBLE: if( sortMissingLast ) { @@ -179,7 +187,9 @@ else if( sortMissingFirst ) { missingValue = top ? Double.POSITIVE_INFINITY : Double.NEGATIVE_INFINITY; } - return new SortField( field.getName(), FieldCache.NUMERIC_UTILS_DOUBLE_PARSER, top).setMissingValue(missingValue); + sf = new SortField( field.getName(), FieldCache.NUMERIC_UTILS_DOUBLE_PARSER, top); + sf.setMissingValue(missingValue); + return sf; default: throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + field.name); Index: solr/core/src/java/org/apache/solr/schema/EnumField.java =================================================================== --- solr/core/src/java/org/apache/solr/schema/EnumField.java (revision 1558020) +++ solr/core/src/java/org/apache/solr/schema/EnumField.java (working copy) @@ -178,7 +178,9 @@ public SortField getSortField(SchemaField field, boolean top) { field.checkSortability(); final Object missingValue = Integer.MIN_VALUE; - return new SortField(field.getName(), FieldCache.NUMERIC_UTILS_INT_PARSER, top).setMissingValue(missingValue); + SortField sf = new SortField(field.getName(), FieldCache.NUMERIC_UTILS_INT_PARSER, top); + sf.setMissingValue(missingValue); + return sf; } /**