Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1454796) +++ lucene/CHANGES.txt (working copy) @@ -31,6 +31,11 @@ * LUCENE-4815: DrillSideways now allows more than one FacetRequest per dimension (Mike McCandless) + +* LUCENE-3918: IndexSorter has been ported to 4.3 API and now supports + sorting documents by a numeric DocValues field, or reverse the order of + the documents in the index. Additionally, apps can implement their own + sort criteria. (Anat Hashavit, Shai Erera) * LUCENE-4817: Added KeywordRepeatFilter that allows to emit a token twice once as a keyword and once as an ordinary token allow stemmers to emit Index: lucene/misc/src/java/org/apache/lucene/index/sorter/NumericDocValuesSorter.java =================================================================== --- lucene/misc/src/java/org/apache/lucene/index/sorter/NumericDocValuesSorter.java (revision 0) +++ lucene/misc/src/java/org/apache/lucene/index/sorter/NumericDocValuesSorter.java (working copy) @@ -0,0 +1,65 @@ +package org.apache.lucene.index.sorter; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.AbstractList; +import java.util.List; + +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.NumericDocValues; + +/** + * A {@link Sorter} which sorts documents according to their + * {@link NumericDocValues}. + * + * @lucene.experimental + */ +public class NumericDocValuesSorter extends Sorter { + + private final String fieldName; + + public NumericDocValuesSorter(final String fieldName) { + this.fieldName = fieldName; + } + + @Override + public int[] oldToNew(final AtomicReader reader) throws IOException { + final NumericDocValues ndv = reader.getNumericDocValues(fieldName); + final int maxDoc = reader.maxDoc(); + final int[] docs = new int[maxDoc]; + final List<Long> values = new AbstractList<Long>() { // lazy per-document view over the doc values + + @Override + public Long get(int doc) { + return ndv.get(doc); + } + + @Override + public int size() { + return reader.maxDoc(); + } + + }; + for (int i = 0; i < maxDoc; i++) { + docs[i] = i; // identity order; compute() sorts this array by value + } + return compute(docs, values); + } + +} Property changes on: lucene/misc/src/java/org/apache/lucene/index/sorter/NumericDocValuesSorter.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java =================================================================== --- lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java (revision 0) +++ lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java (working copy) @@ -0,0 +1,106 @@ +package org.apache.lucene.index.sorter; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.util.SorterTemplate; + +/** + * Sorts documents in a given index by returning a permutation on the docs. + * Implementations can call {@link #compute(int[], List)} to compute the + * old-to-new permutation over the given documents and values. + * + * @lucene.experimental + */ +public abstract class Sorter { + + /** Sorts documents in reverse order. 
*/ + public static final Sorter REVERSE_DOCS = new Sorter() { + @Override + public int[] oldToNew(final AtomicReader reader) throws IOException { + final int maxDoc = reader.maxDoc(); + int[] reverseDocs = new int[maxDoc]; + for (int i = 0; i < maxDoc; i++) { + reverseDocs[i] = maxDoc - (i + 1); + } + return reverseDocs; + } + }; + + private static final class DocValueSorterTemplate<T extends Comparable<? super T>> extends SorterTemplate { + + private final int[] docs; + private final List<? extends T> values; + + private T pivot; + + public DocValueSorterTemplate(int[] docs, List<? extends T> values) { + this.docs = docs; + this.values = values; + } + + @Override + protected int compare(int i, int j) { + return values.get(docs[i]).compareTo(values.get(docs[j])); + } + + @Override + protected int comparePivot(int j) { + return pivot.compareTo(values.get(docs[j])); + } + + @Override + protected void setPivot(int i) { + pivot = values.get(docs[i]); + } + + @Override + protected void swap(int i, int j) { + int tmpDoc = docs[i]; + docs[i] = docs[j]; + docs[j] = tmpDoc; + } + } + + /** Computes the old-to-new permutation over the given documents and values; {@code docs} is sorted in place by each document's value. */ + protected static <T extends Comparable<? super T>> int[] compute(int[] docs, List<? extends T> values) { + SorterTemplate sorter = new DocValueSorterTemplate<T>(docs, values); + sorter.quickSort(0, docs.length - 1); + + final int[] oldToNew = new int[docs.length]; + for (int i = 0; i < docs.length; i++) { + oldToNew[docs[i]] = i; // docs[i] is the old ID that now sits at sorted position i + } + return oldToNew; + } + + /** + * Returns a mapping from the old document ID to its new location in the + * sorted index. Implementations can use the auxiliary + * {@link #compute(int[], List)} to compute the old-to-new permutation + * given an array of documents and their corresponding values. + *

+ * NOTE: deleted documents are expected to appear in the mapping as + * well, they will however be dropped when the index is actually sorted. + */ + public abstract int[] oldToNew(AtomicReader reader) throws IOException; + +} Property changes on: lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java =================================================================== --- lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java (revision 0) +++ lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java (working copy) @@ -0,0 +1,648 @@ +package org.apache.lucene.index.sorter; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FilterAtomicReader; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMFile; +import org.apache.lucene.store.RAMInputStream; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.SorterTemplate; + +/** + * An {@link AtomicReader} which supports sorting documents by a given + * {@link Sorter}. You can use this class to sort an index as follows: + * + *

+ * IndexWriter writer; // writer to which the sorted index will be added
+ * DirectoryReader reader; // reader on the input index
+ * Sorter sorter; // determines how the documents are sorted
+ * AtomicReader sortingReader = new SortingAtomicReader(SlowCompositeReaderWrapper.wrap(reader), sorter);
+ * writer.addIndexes(sortingReader);
+ * writer.close();
+ * reader.close();
+ * 
+ * + * @lucene.experimental + */ +public class SortingAtomicReader extends FilterAtomicReader { + + private static class SortingFields extends FilterFields { + + private final int[] old2new; + private final Bits inLiveDocs; + private final FieldInfos infos; + + public SortingFields(final Fields in, final Bits inLiveDocs, FieldInfos infos, final int[] old2new) { + super(in); + this.old2new = old2new; + this.inLiveDocs = inLiveDocs; + this.infos = infos; + } + + @Override + public Terms terms(final String field) throws IOException { + Terms terms = in.terms(field); + if (terms == null) { + return null; + } else { + return new SortingTerms(terms, inLiveDocs, infos.fieldInfo(field).getIndexOptions(), old2new); + } + } + + } + + private static class SortingTerms extends FilterTerms { + + private final int[] old2new; + private final Bits inLiveDocs; + private final IndexOptions indexOptions; + + public SortingTerms(final Terms in, final Bits inLiveDocs, IndexOptions indexOptions, final int[] old2new) { + super(in); + this.old2new = old2new; + this.inLiveDocs = inLiveDocs; + this.indexOptions = indexOptions; + } + + @Override + public TermsEnum iterator(final TermsEnum reuse) throws IOException { + return new SortingTermsEnum(in.iterator(reuse), inLiveDocs, old2new, indexOptions); + } + + } + + private static class SortingTermsEnum extends FilterTermsEnum { + + private final int[] old2new; + private final Bits inLiveDocs; + private final IndexOptions indexOptions; + + public SortingTermsEnum(final TermsEnum in, final Bits inLiveDocs, final int[] old2new, IndexOptions indexOptions) { + super(in); + this.old2new = old2new; + this.inLiveDocs = inLiveDocs; + this.indexOptions = indexOptions; + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, final int flags) throws IOException { + if (liveDocs != null) { + liveDocs = inLiveDocs; + } + + // if we're asked to reuse the given DocsEnum and it is Sorting, return + // the wrapped one, since some Codecs expect 
it. + if (reuse != null && reuse instanceof SortingDocsEnum) { + reuse = ((SortingDocsEnum) reuse).getWrapped(); + } + boolean withFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >=0 && (flags & DocsEnum.FLAG_FREQS) != 0; + return new SortingDocsEnum(in.docs(liveDocs, reuse, flags), withFreqs, old2new); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, final int flags) throws IOException { + if (liveDocs != null) { + liveDocs = inLiveDocs; + } + + // if we're asked to reuse the given DocsAndPositionsEnum and it is + // Sorting, return the wrapped one, since some Codecs expect it. + if (reuse != null && reuse instanceof SortingDocsAndPositionsEnum) { + reuse = ((SortingDocsAndPositionsEnum) reuse).getWrapped(); + } + + final DocsAndPositionsEnum positions = in.docsAndPositions(liveDocs, reuse, flags); + if (positions == null) { + return null; + } else { + boolean storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + return new SortingDocsAndPositionsEnum(positions, old2new, storeOffsets); + } + } + + } + + private static class SortingBinaryDocValues extends BinaryDocValues { + + private final BinaryDocValues in; + private final int[] new2old; + + SortingBinaryDocValues(BinaryDocValues in, int[] new2old) { + this.in = in; + this.new2old = new2old; + } + + @Override + public void get(int docID, BytesRef result) { + in.get(new2old[docID], result); + } + } + + private static class SortingNumericDocValues extends NumericDocValues { + + private final NumericDocValues in; + private final int[] new2old; + + public SortingNumericDocValues(final NumericDocValues in, final int[] new2old) { + this.in = in; + this.new2old = new2old; + } + + @Override + public long get(int docID) { + return in.get(new2old[docID]); + } + } + + private static class SortingSortedDocValues extends SortedDocValues { + + private final SortedDocValues in; + private final int[] new2old; + 
+ SortingSortedDocValues(SortedDocValues in, int[] new2old) { + this.in = in; + this.new2old = new2old; + } + + @Override + public int getOrd(int docID) { + return in.getOrd(new2old[docID]); + } + + @Override + public void lookupOrd(int ord, BytesRef result) { + in.lookupOrd(ord, result); + } + + @Override + public int getValueCount() { + return in.getValueCount(); + } + + @Override + public void get(int docID, BytesRef result) { + in.get(new2old[docID], result); + } + + @Override + public int lookupTerm(BytesRef key) { + return in.lookupTerm(key); + } + } + + private static class SortingSortedSetDocValues extends SortedSetDocValues { + + private final SortedSetDocValues in; + private final int[] new2old; + + SortingSortedSetDocValues(SortedSetDocValues in, int[] new2old) { + this.in = in; + this.new2old = new2old; + } + + @Override + public long nextOrd() { + return in.nextOrd(); + } + + @Override + public void setDocument(int docID) { + in.setDocument(new2old[docID]); + } + + @Override + public void lookupOrd(long ord, BytesRef result) { + in.lookupOrd(ord, result); + } + + @Override + public long getValueCount() { + return in.getValueCount(); + } + + @Override + public long lookupTerm(BytesRef key) { + return in.lookupTerm(key); + } + } + + private static class SortingDocsEnum extends FilterDocsEnum { + + private static final class DocFreqSorterTemplate extends SorterTemplate { + + private final int[] docs; + private final int[] freqs; + + private int pivot; + + public DocFreqSorterTemplate(int[] docs, int[] freqs) { + this.docs = docs; + this.freqs = freqs; + } + + @Override + protected int compare(int i, int j) { + return docs[i] - docs[j]; + } + + @Override + protected int comparePivot(int j) { + return pivot - docs[j]; + } + + @Override + protected void setPivot(int i) { + pivot = docs[i]; + } + + @Override + protected void swap(int i, int j) { + int tmpDoc = docs[i]; + docs[i] = docs[j]; + docs[j] = tmpDoc; + + int tmpFreq = freqs[i]; + freqs[i] = freqs[j]; 
+ freqs[j] = tmpFreq; + } + } + + private int[] docs = new int[64]; + private int[] freqs; + private int docIt = -1; + private final int upto; + private final boolean withFreqs; + + public SortingDocsEnum(final DocsEnum in, boolean withFreqs, final int[] old2new) throws IOException { + super(in); + this.withFreqs = withFreqs; + int i = 0; + int doc; + if (withFreqs) { + freqs = new int[docs.length]; + while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){ + if (i >= docs.length) { + docs = ArrayUtil.grow(docs, docs.length + 1); + freqs = ArrayUtil.grow(freqs, freqs.length + 1); + } + docs[i] = old2new[doc]; + freqs[i] = in.freq(); + ++i; + } + SorterTemplate sorter = new DocFreqSorterTemplate(docs, freqs); + sorter.quickSort(0, i - 1); + } else { + freqs = null; + while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){ + if (i >= docs.length) { + docs = ArrayUtil.grow(docs, docs.length + 1); + } + docs[i++] = old2new[doc]; + } + Arrays.sort(docs, 0, i); + } + upto = i; + } + + @Override + public int advance(final int target) throws IOException { + // need to support it for checkIndex, but in practice it won't be called, so + // don't bother to implement efficiently for now. + while (nextDoc() < target) {} + return docID(); + } + + @Override + public int docID() { + return docIt < 0 ? -1 : docIt >= upto ? NO_MORE_DOCS : docs[docIt]; // docIt is -1 until nextDoc() is first called; never index docs[-1] + } + + @Override + public int freq() throws IOException { + return withFreqs && docIt < upto ? freqs[docIt] : 1; + } + + @Override + public int nextDoc() throws IOException { + if (++docIt >= upto) return NO_MORE_DOCS; + return docs[docIt]; + } + + /** Returns the wrapped {@link DocsEnum}. */ + DocsEnum getWrapped() { + return in; + } + } + + private static class SortingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum { + + /** + * A {@link SorterTemplate} which sorts two parallel arrays of doc IDs and + * offsets in one go. Every time a doc ID is 'swapped', its corresponding offset + * is swapped too. 
+ */ + private static final class DocOffsetSorterTemplate extends SorterTemplate { + + private final int[] docs; + private final long[] offsets; + + private int pivot; + + public DocOffsetSorterTemplate(int[] docs, long[] offsets) { + this.docs = docs; + this.offsets = offsets; + } + + @Override + protected int compare(int i, int j) { + return docs[i] - docs[j]; + } + + @Override + protected int comparePivot(int j) { + return pivot - docs[j]; + } + + @Override + protected void setPivot(int i) { + pivot = docs[i]; + } + + @Override + protected void swap(int i, int j) { + int tmpDoc = docs[i]; + docs[i] = docs[j]; + docs[j] = tmpDoc; + + long tmpOffset = offsets[i]; + offsets[i] = offsets[j]; + offsets[j] = tmpOffset; + } + } + + private int[] docs; + private long[] offsets; + private final int upto; + + private final IndexInput postingInput; + private final boolean storeOffsets; + + private int docIt = -1; + private int pos; + private int startOffset = -1; + private int endOffset = -1; + private final BytesRef payload = new BytesRef(32); + private int currFreq; + + public SortingDocsAndPositionsEnum(final DocsAndPositionsEnum in, final int[] old2new, boolean storeOffsets) throws IOException { + super(in); + this.storeOffsets = storeOffsets; + final RAMFile file = new RAMFile(); + final IndexOutput out = new RAMOutputStream(file); + docs = new int[32]; + offsets = new long[32]; + int doc; + int i = 0; + while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + if (i == docs.length) { + docs = ArrayUtil.grow(docs, i + 1); + offsets = ArrayUtil.grow(offsets, i + 1); + } + docs[i] = old2new[doc]; + offsets[i] = out.getFilePointer(); + addPositions(in, out); + i++; + } + upto = i; + SorterTemplate sorter = new DocOffsetSorterTemplate(docs, offsets); + sorter.quickSort(0, upto - 1); + out.close(); + this.postingInput = new RAMInputStream("", file); + } + + private void addPositions(final DocsAndPositionsEnum in, final IndexOutput out) throws IOException { + int 
freq = in.freq(); + out.writeVInt(freq); + for (int i = 0; i < freq; i++) { + final int pos = in.nextPosition(); + out.writeVInt(pos); + if (storeOffsets) { // don't encode offsets if they are not stored + out.writeVInt(in.startOffset()); + out.writeVInt(in.endOffset()); + } + BytesRef payload = in.getPayload(); + if (payload != null) { + out.writeVInt(payload.length); + out.writeBytes(payload.bytes, payload.offset, payload.length); + } else { + out.writeVInt(0); + } + } + } + + @Override + public int advance(final int target) throws IOException { + // need to support it for checkIndex, but in practice it won't be called, so + // don't bother to implement efficiently for now. + while (nextDoc() < target) {} + return docID(); + } + + @Override + public int docID() { + return docIt < 0 ? -1 : docIt >= upto ? NO_MORE_DOCS : docs[docIt]; // docIt is -1 until nextDoc() is first called; never index docs[-1] + } + + @Override + public int endOffset() throws IOException { + return endOffset; + } + + @Override + public int freq() throws IOException { + return currFreq; + } + + @Override + public BytesRef getPayload() throws IOException { + return payload.length == 0 ? null : payload; + } + + @Override + public int nextDoc() throws IOException { + if (++docIt >= upto) return DocIdSetIterator.NO_MORE_DOCS; + postingInput.seek(offsets[docIt]); + currFreq = postingInput.readVInt(); + return docs[docIt]; + } + + @Override + public int nextPosition() throws IOException { + pos = postingInput.readVInt(); + if (storeOffsets) { + startOffset = postingInput.readVInt(); + endOffset = postingInput.readVInt(); + } + int length = postingInput.readVInt(); + if (length > 0) { + if (length >= payload.bytes.length) { + payload.grow(length + 1); + } + postingInput.readBytes(payload.bytes, 0, length); + } + payload.length = length; + return pos; + } + + @Override + public int startOffset() throws IOException { + return startOffset; + } + + /** Returns the wrapped {@link DocsAndPositionsEnum}. 
*/ + DocsAndPositionsEnum getWrapped() { + return in; + } + } + + private final int[] old2new, new2old; + private final FixedBitSet mappedLiveDocs; + + public SortingAtomicReader(final AtomicReader in, final Sorter sorter) throws IOException { + super(in); + old2new = sorter.oldToNew(in); + if (old2new.length != in.maxDoc()) { + throw new IllegalArgumentException("sorter should provide mapping for every document in the index, including deleted ones"); + } + new2old = new int[old2new.length]; + for (int i = 0; i < new2old.length; i++) { + new2old[old2new[i]] = i; + } + + if (!in.hasDeletions()) { + mappedLiveDocs = null; + } else { + mappedLiveDocs = new FixedBitSet(in.maxDoc()); + mappedLiveDocs.set(0, in.maxDoc()); + Bits liveDocs = in.getLiveDocs(); + int len = liveDocs.length(); + for (int i = 0; i < len; i++) { + if (!liveDocs.get(i)) { + mappedLiveDocs.clear(old2new[i]); + } + } + } + } + + @Override + public void document(final int docID, final StoredFieldVisitor visitor) throws IOException { + in.document(new2old[docID], visitor); + } + + @Override + public Fields fields() throws IOException { + Fields fields = in.fields(); + if (fields == null) { + return null; + } else { + return new SortingFields(fields, in.getLiveDocs(), in.getFieldInfos(), old2new); + } + } + + @Override + public BinaryDocValues getBinaryDocValues(String field) throws IOException { + BinaryDocValues oldDocValues = in.getBinaryDocValues(field); + if (oldDocValues == null) { + return null; + } else { + return new SortingBinaryDocValues(oldDocValues, new2old); + } + } + + @Override + public Bits getLiveDocs() { + ensureOpen(); + return mappedLiveDocs; + } + + @Override + public NumericDocValues getNormValues(String field) throws IOException { + final NumericDocValues norm = in.getNormValues(field); + if (norm == null) { + return null; + } else { + return new SortingNumericDocValues(norm, new2old); + } + } + + @Override + public NumericDocValues getNumericDocValues(String field) throws 
IOException { + final NumericDocValues oldDocValues = in.getNumericDocValues(field); + if (oldDocValues == null) return null; + return new SortingNumericDocValues(oldDocValues, new2old); + } + + @Override + public SortedDocValues getSortedDocValues(String field) throws IOException { + SortedDocValues sortedDV = in.getSortedDocValues(field); + if (sortedDV == null) { + return null; + } else { + return new SortingSortedDocValues(sortedDV, new2old); + } + } + + @Override + public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { + SortedSetDocValues sortedSetDV = in.getSortedSetDocValues(field); + if (sortedSetDV == null) { + return null; + } else { + return new SortingSortedSetDocValues(sortedSetDV, new2old); + } + } + + @Override + public Fields getTermVectors(final int docID) throws IOException { + return in.getTermVectors(new2old[docID]); + } + +} Property changes on: lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/misc/src/java/org/apache/lucene/index/sorter/package.html =================================================================== --- lucene/misc/src/java/org/apache/lucene/index/sorter/package.html (revision 0) +++ lucene/misc/src/java/org/apache/lucene/index/sorter/package.html (working copy) @@ -0,0 +1,27 @@ + + + + +Provides index sorting capabilities. The application can use one of the +pre-existing Sorter implementations, e.g. to sort by a numeric +DocValues or reverse the order of the documents. Additionally, the +application can implement a Sorter which returns a permutation on +a source Directory's document IDs, to sort the input documents by additional +values. 
+ + Property changes on: lucene/misc/src/java/org/apache/lucene/index/sorter/package.html ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java =================================================================== --- lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java (revision 0) +++ lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java (working copy) @@ -0,0 +1,79 @@ +package org.apache.lucene.index.sorter; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util._TestUtil; +import org.junit.BeforeClass; + +public class IndexSortingTest extends SorterTestBase { + + private static final Sorter[] SORTERS = new Sorter[] { + new NumericDocValuesSorter(NUMERIC_DV_FIELD), + Sorter.REVERSE_DOCS, + }; + + @BeforeClass + public static void beforeClassSorterUtilTest() throws Exception { + // only read the values of the undeleted documents, since after addIndexes, + // the deleted ones will be dropped from the index. + Bits liveDocs = reader.getLiveDocs(); + List<Integer> values = new ArrayList<Integer>(); + for (int i = 0; i < reader.maxDoc(); i++) { + if (liveDocs == null || liveDocs.get(i)) { + values.add(Integer.valueOf(reader.document(i).get(ID_FIELD))); + } + } + Sorter sorter = SORTERS[random().nextInt(SORTERS.length)]; + if (sorter == Sorter.REVERSE_DOCS) { + Collections.reverse(values); + } else { + Collections.sort(values); + } + sortedValues = values.toArray(new Integer[values.size()]); + if (VERBOSE) { + System.out.println("sortedValues: " + java.util.Arrays.toString(sortedValues)); // print the contents, not the array's identity string + System.out.println("Sorter: " + sorter); + } + + Directory target = newDirectory(); + IndexWriter writer = new IndexWriter(target, newIndexWriterConfig(TEST_VERSION_CURRENT, null)); + reader = new SortingAtomicReader(reader, sorter); + writer.addIndexes(reader); + writer.close(); + reader.close(); + dir.close(); + + // CheckIndex the target directory + dir = target; + _TestUtil.checkIndex(dir); + + // set reader for tests + reader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir)); + assertFalse("index should not have deletions", reader.hasDeletions()); + } + +} Property changes on: 
lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/misc/src/test/org/apache/lucene/index/sorter/SorterTestBase.java
===================================================================
--- lucene/misc/src/test/org/apache/lucene/index/sorter/SorterTestBase.java (revision 0)
+++ lucene/misc/src/test/org/apache/lucene/index/sorter/SorterTestBase.java (working copy)
@@ -0,0 +1,424 @@
+package org.apache.lucene.index.sorter;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Base class for tests that verify a sorted view of an index. Every document
+ * of the index built by {@link #createIndex(Directory, int, Random)} carries
+ * its ID, or a value derived from it, in each structure the tests inspect.
+ * Subclasses are expected to replace {@link #reader} with the sorted reader
+ * and to fill {@link #sortedValues} from their own {@code @BeforeClass}
+ * method.
+ */
+public abstract class SorterTestBase extends LuceneTestCase {
+
+  /**
+   * Delegating {@link Similarity} which, for {@link SorterTestBase#NORMS_FIELD}
+   * only, returns the int bits of the field boost as the norm.
+   */
+  static final class NormsSimilarity extends Similarity {
+
+    private final Similarity in;
+
+    public NormsSimilarity(Similarity in) {
+      this.in = in;
+    }
+
+    @Override
+    public long computeNorm(FieldInvertState state) {
+      if (state.getName().equals(NORMS_FIELD)) {
+        return Float.floatToIntBits(state.getBoost());
+      } else {
+        return in.computeNorm(state);
+      }
+    }
+
+    @Override
+    public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return in.computeWeight(queryBoost, collectionStats, termStats);
+    }
+
+    @Override
+    public ExactSimScorer exactSimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
+      return in.exactSimScorer(weight, context);
+    }
+
+    @Override
+    public SloppySimScorer sloppySimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
+      return in.sloppySimScorer(weight, context);
+    }
+
+  }
+
+  /**
+   * Token stream producing {@code id / 10 + 1} tokens for the ID passed to
+   * {@link #setId(int)}. The term text is appended once, in the constructor,
+   * and is never cleared, so every token carries the same term text.
+   * The k-th token (0-based) has start and end offset k and a payload holding
+   * the number of tokens left, including itself, as a decimal string.
+   */
+  static final class PositionsTokenStream extends TokenStream {
+
+    private final CharTermAttribute term;
+    private final PayloadAttribute payload;
+    private final OffsetAttribute offset;
+
+    private int pos, off;
+
+    public PositionsTokenStream() {
+      term = addAttribute(CharTermAttribute.class);
+      term.append(DOC_POSITIONS_TERM);
+      payload = addAttribute(PayloadAttribute.class);
+      offset = addAttribute(OffsetAttribute.class);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (pos == 0) {
+        return false;
+      }
+
+      payload.setPayload(new BytesRef(Integer.toString(pos)));
+      offset.setOffset(off, off);
+      --pos;
+      ++off;
+      return true;
+    }
+
+    /** Resets the stream so that it produces the tokens for {@code id}. */
+    void setId(int id) {
+      pos = id / 10 + 1;
+      off = 0;
+    }
+  }
+
+  protected static final String ID_FIELD = "id";
+  protected static final String DOCS_ENUM_FIELD = "docs";
+  protected static final String DOCS_ENUM_TERM = "$all$";
+  protected static final String DOC_POSITIONS_FIELD = "positions";
+  protected static final String DOC_POSITIONS_TERM = "$all$";
+  protected static final String NUMERIC_DV_FIELD = "numeric";
+  protected static final String NORMS_FIELD = "norm";
+  protected static final String BINARY_DV_FIELD = "binary";
+  protected static final String SORTED_DV_FIELD = "sorted";
+  protected static final String SORTED_SET_DV_FIELD = "sorted_set";
+  protected static final String TERM_VECTORS_FIELD = "term_vectors";
+
+  private static final FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
+  static {
+    TERM_VECTORS_TYPE.setStoreTermVectors(true);
+    TERM_VECTORS_TYPE.freeze();
+  }
+
+  private static final FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
+  static {
+    POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    POSITIONS_TYPE.freeze();
+  }
+
+  protected static Directory dir;
+  protected static AtomicReader reader;
+  /** Expected ID of each document of the sorted reader, indexed by doc ID. */
+  protected static Integer[] sortedValues;
+
+  /**
+   * Builds the document for {@code id}, adding the ID, or a value derived from
+   * it, to every field the tests inspect.
+   */
+  private static Document doc(final int id, PositionsTokenStream positions) {
+    final Document doc = new Document();
+    doc.add(new StringField(ID_FIELD, Integer.toString(id), Store.YES));
+    doc.add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Store.NO));
+    positions.setId(id);
+    if (doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
+      // codec doesn't support offsets: just index positions for the field
+      doc.add(new Field(DOC_POSITIONS_FIELD, positions, TextField.TYPE_NOT_STORED));
+    } else {
+      doc.add(new Field(DOC_POSITIONS_FIELD, positions, POSITIONS_TYPE));
+    }
+    doc.add(new NumericDocValuesField(NUMERIC_DV_FIELD, id));
+    TextField norms = new TextField(NORMS_FIELD, Integer.toString(id), Store.NO);
+    norms.setBoost(Float.intBitsToFloat(id));
+    doc.add(norms);
+    doc.add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(Integer.toString(id))));
+    doc.add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(Integer.toString(id))));
+    if (defaultCodecSupportsSortedSet()) {
+      doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id))));
+      doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id + 1))));
+    }
+    doc.add(new Field(TERM_VECTORS_FIELD, Integer.toString(id), TERM_VECTORS_TYPE));
+    return doc;
+  }
+
+  /**
+   * Creates an index for sorting: adds {@code numDocs} documents with IDs
+   * {@code 0, 10, 20, ...} in shuffled order, commits, then deletes each of
+   * them with probability 0.2.
+   */
+  public static void createIndex(Directory dir, int numDocs, Random random) throws IOException {
+    List<Integer> ids = new ArrayList<Integer>();
+    for (int i = 0; i < numDocs; i++) {
+      ids.add(Integer.valueOf(i * 10));
+    }
+    // shuffle them for indexing
+    Collections.shuffle(ids, random);
+    if (VERBOSE) {
+      System.out.println("Shuffled IDs for indexing: " + Arrays.toString(ids.toArray()));
+    }
+
+    PositionsTokenStream positions = new PositionsTokenStream();
+    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
+    conf.setMaxBufferedDocs(4); // create some segments
+    conf.setSimilarity(new NormsSimilarity(conf.getSimilarity())); // for testing norms field
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir, conf);
+    writer.setDoRandomForceMerge(false);
+    for (int id : ids) {
+      writer.addDocument(doc(id, positions));
+    }
+    // delete some documents
+    writer.commit();
+    for (Integer id : ids) {
+      if (random.nextDouble() < 0.2) {
+        if (VERBOSE) {
+          System.out.println("delete doc_id " + id);
+        }
+        writer.deleteDocuments(new Term(ID_FIELD, id.toString()));
+      }
+    }
+    writer.close();
+  }
+
+  /** Builds the shared index and opens it as a single {@link AtomicReader}. */
+  @BeforeClass
+  public static void beforeClassSorterTestBase() throws Exception {
+    dir = newDirectory();
+    int numDocs = atLeast(20);
+    createIndex(dir, numDocs, random());
+
+    reader = new SlowCompositeReaderWrapper(DirectoryReader.open(dir));
+  }
+
+  /** Closes the shared reader and directory and clears the static state. */
+  @AfterClass
+  public static void afterClassSorterTestBase() throws Exception {
+    reader.close();
+    dir.close();
+    // do not keep the closed reader and directory reachable through the statics
+    reader = null;
+    dir = null;
+    sortedValues = null;
+  }
+
+  /** Verifies the binary DocValues of every document. */
+  @Test
+  public void testBinaryDocValuesField() throws Exception {
+    BinaryDocValues dv = reader.getBinaryDocValues(BINARY_DV_FIELD);
+    BytesRef bytes = new BytesRef();
+    for (int i = 0; i < reader.maxDoc(); i++) {
+      dv.get(i, bytes);
+      assertEquals("incorrect binary DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString());
+    }
+  }
+
+  /** Verifies freq, positions, offsets and payloads through nextDoc() and advance(). */
+  @Test
+  public void testDocsAndPositionsEnum() throws Exception {
+    Term term = new Term(DOC_POSITIONS_FIELD, DOC_POSITIONS_TERM);
+    DocsAndPositionsEnum sortedPositions = reader.termPositionsEnum(term);
+    int doc;
+
+    // test nextDoc()
+    while ((doc = sortedPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+      int freq = sortedPositions.freq();
+      assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq);
+      for (int i = 0; i < freq; i++) {
+        assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition());
+        if (!doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
+          assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset());
+          assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset());
+        }
+        assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString()));
+      }
+    }
+
+    // test advance(); the target must be greater than the current doc, so
+    // start from -1 and always advance to doc + 1
+    sortedPositions = reader.termPositionsEnum(term);
+    doc = -1;
+    while ((doc = sortedPositions.advance(doc + 1)) != DocIdSetIterator.NO_MORE_DOCS) {
+      int freq = sortedPositions.freq();
+      assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq);
+      for (int i = 0; i < freq; i++) {
+        assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition());
+        if (!doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
+          assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset());
+          assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset());
+        }
+        assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString()));
+      }
+    }
+  }
+
+  /** Verifies postings, stored IDs and live docs through nextDoc() and advance(). */
+  @Test
+  public void testDocsEnum() throws Exception {
+    Term term = new Term(DOCS_ENUM_FIELD, DOCS_ENUM_TERM);
+    DocsEnum docs = reader.termDocsEnum(term);
+    Bits mappedLiveDocs = reader.getLiveDocs();
+    int doc;
+    int prev = -1;
+    while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+      if (mappedLiveDocs != null) {
+        assertTrue("document " + doc + " marked as deleted", mappedLiveDocs.get(doc));
+      }
+      assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD)));
+      // every doc skipped by the enum must be a deleted one
+      while (++prev < doc) {
+        assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs.get(prev));
+      }
+    }
+
+    // same checks through advance(), always targeting the doc after the current one
+    docs = reader.termDocsEnum(term);
+    doc = -1;
+    prev = -1;
+    while ((doc = docs.advance(doc + 1)) != DocIdSetIterator.NO_MORE_DOCS) {
+      if (mappedLiveDocs != null) {
+        assertTrue("document " + doc + " marked as deleted", mappedLiveDocs.get(doc));
+      }
+      assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD)));
+      // every doc skipped by the enum must be a deleted one
+      while (++prev < doc) {
+        assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs.get(prev));
+      }
+    }
+  }
+
+  /** Verifies the norm of every document. */
+  @Test
+  public void testNormValues() throws Exception {
+    NumericDocValues dv = reader.getNormValues(NORMS_FIELD);
+    int maxDoc = reader.maxDoc();
+    for (int i = 0; i < maxDoc; i++) {
+      assertEquals("incorrect norm value for doc " + i, sortedValues[i].intValue(), dv.get(i));
+    }
+  }
+
+  /** Verifies the numeric DocValues of every document. */
+  @Test
+  public void testNumericDocValuesField() throws Exception {
+    NumericDocValues dv = reader.getNumericDocValues(NUMERIC_DV_FIELD);
+    int maxDoc = reader.maxDoc();
+    for (int i = 0; i < maxDoc; i++) {
+      assertEquals("incorrect numeric DocValues for doc " + i, sortedValues[i].intValue(), dv.get(i));
+    }
+  }
+
+  /** Verifies the sorted DocValues of every document. */
+  @Test
+  public void testSortedDocValuesField() throws Exception {
+    SortedDocValues dv = reader.getSortedDocValues(SORTED_DV_FIELD);
+    int maxDoc = reader.maxDoc();
+    BytesRef bytes = new BytesRef();
+    for (int i = 0; i < maxDoc; i++) {
+      dv.get(i, bytes);
+      assertEquals("incorrect sorted DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString());
+    }
+  }
+
+  /** Verifies both sorted-set DocValues of every document, if the codec supports them. */
+  @Test
+  public void testSortedSetDocValuesField() throws Exception {
+    assumeTrue("default codec does not support SORTED_SET", defaultCodecSupportsSortedSet());
+    SortedSetDocValues dv = reader.getSortedSetDocValues(SORTED_SET_DV_FIELD);
+    int maxDoc = reader.maxDoc();
+    BytesRef bytes = new BytesRef();
+    for (int i = 0; i < maxDoc; i++) {
+      dv.setDocument(i);
+      dv.lookupOrd(dv.nextOrd(), bytes);
+      int value = sortedValues[i].intValue();
+      assertEquals("incorrect sorted-set DocValues for doc " + i, Integer.valueOf(value).toString(), bytes.utf8ToString());
+      dv.lookupOrd(dv.nextOrd(), bytes);
+      assertEquals("incorrect sorted-set DocValues for doc " + i, Integer.valueOf(value + 1).toString(), bytes.utf8ToString());
+      assertEquals(SortedSetDocValues.NO_MORE_ORDS, dv.nextOrd());
+    }
+  }
+
+  /** Verifies the term vector of every document. */
+  @Test
+  public void testTermVectors() throws Exception {
+    int maxDoc = reader.maxDoc();
+    for (int i = 0; i < maxDoc; i++) {
+      Terms terms = reader.getTermVector(i, TERM_VECTORS_FIELD);
+      assertNotNull("term vectors not found for doc " + i + " field [" + TERM_VECTORS_FIELD + "]", terms);
+      assertEquals("incorrect term vector for doc " + i, sortedValues[i].toString(), terms.iterator(null).next().utf8ToString());
+    }
+  }
+
+}
Property changes on: lucene/misc/src/test/org/apache/lucene/index/sorter/SorterTestBase.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java
===================================================================
--- lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java (revision 0)
+++ lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java (working copy)
@@ -0,0 +1,75 @@
+package org.apache.lucene.index.sorter;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util._TestUtil;
+import org.junit.BeforeClass;
+
+/** Runs the {@link SorterTestBase} tests against a {@link SortingAtomicReader}. */
+public class SortingAtomicReaderTest extends SorterTestBase {
+  /** Derives the doc ID mapping from the stored IDs and wraps the shared reader with it. */
+  @BeforeClass
+  public static void beforeClassSortingAtomicReaderTest() throws Exception {
+    // build the mapping from the reader, since we deleted documents, some of
+    // them might have disappeared from the index (e.g. if an entire segment is
+    // dropped b/c all its docs are deleted)
+    Integer[] values = new Integer[reader.maxDoc()];
+    int[] docs = new int[reader.maxDoc()];
+    for (int i = 0; i < reader.maxDoc(); i++) {
+      docs[i] = i;
+      values[i] = Integer.valueOf(reader.document(i).get(ID_FIELD));
+    }
+
+    final int[] oldToNew = Sorter.compute(docs, Collections.unmodifiableList(Arrays.asList(values)));
+    // sortedValues[newDoc] is the ID the tests expect at newDoc in the sorted reader
+    sortedValues = new Integer[reader.maxDoc()];
+    for (int i = 0; i < reader.maxDoc(); ++i) {
+      sortedValues[oldToNew[i]] = values[i];
+    }
+    if (VERBOSE) {
+      System.out.println("oldToNew: " + Arrays.toString(oldToNew));
+      System.out.println("sortedValues: " + Arrays.toString(sortedValues));
+    }
+
+    reader = new SortingAtomicReader(reader, new Sorter() {
+      @Override
+      public int[] oldToNew(AtomicReader reader) throws IOException {
+        return oldToNew;
+      }
+    });
+
+    if (VERBOSE) {
+      System.out.print("mapped-deleted-docs: ");
+      Bits mappedLiveDocs = reader.getLiveDocs(); // null when the reader has no deletions
+      for (int i = 0; mappedLiveDocs != null && i < mappedLiveDocs.length(); i++) {
+        if (!mappedLiveDocs.get(i)) {
+          System.out.print(i + " ");
+        }
+      }
+      System.out.println();
+    }
+
+    _TestUtil.checkReader(reader);
+  }
+}
Property changes on: lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property