diff --git a/lucene/core/src/java/org/apache/lucene/index/Uninverter.java b/lucene/core/src/java/org/apache/lucene/index/Uninverter.java
new file mode 100644
index 0000000..5aaef11
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/Uninverter.java
@@ -0,0 +1,289 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.packed.GrowableWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
+import java.io.IOException;
+
+/**
+ * Static helpers that build DocValues instances at search time by
+ * uninverting an indexed field.
+ */
+public class Uninverter {
+
+  public static NumericDocValues uninvertNumeric(AtomicReader reader, String field) throws IOException {
+
+    Terms terms = reader.terms(field);
+    if (terms == null)
+      return NumericDocValues.EMPTY;
+
+    final int maxDoc = reader.maxDoc();
+    // Docs with no value for the field are left at the default of 0
+    final long[] values = new long[maxDoc];
+
+    final TermsEnum te = NumericUtils.filterPrefixCodedLongs(terms.iterator(null));
+    DocsEnum de = null;
+    BytesRef term = null;
+
+    while ((term = te.next()) != null) {
+      long value = NumericUtils.prefixCodedToLong(term);
+      de = te.docs(null, de, DocsEnum.FLAG_NONE);
+      int doc;
+      while ((doc = de.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+        values[doc] = value;
+      }
+    }
+
+    return new NumericDocValues() {
+      @Override
+      public long get(int docID) {
+        return values[docID];
+      }
+    };
+  }
+
+  public static BinaryDocValues uninvertBinary(AtomicReader reader, String field) throws IOException {
+    final int maxDoc = reader.maxDoc();
+    Terms terms = reader.terms(field);
+
+    final float acceptableOverheadRatio = PackedInts.FAST; // nocommit pass this in as a parameter?
+
+    final int termCountHardLimit = maxDoc;
+
+    // Holds the actual term data, expanded.
+    final PagedBytes bytes = new PagedBytes(15);
+
+    int startBPV;
+
+    if (terms != null) {
+      // Try for coarse estimate for number of bits; this
+      // should be an underestimate most of the time, which
+      // is fine -- GrowableWriter will reallocate as needed
+      long numUniqueTerms = terms.size();
+      if (numUniqueTerms != -1L) {
+        if (numUniqueTerms > termCountHardLimit) {
+          numUniqueTerms = termCountHardLimit;
+        }
+        startBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
+      } else {
+        startBPV = 1;
+      }
+    } else {
+      startBPV = 1;
+    }
+
+    final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
+
+    // pointer==0 means not set
+    bytes.copyUsingLengthPrefix(new BytesRef());
+
+    if (terms != null) {
+      int termCount = 0;
+      final TermsEnum termsEnum = terms.iterator(null);
+      DocsEnum docs = null;
+      while (true) {
+        if (termCount++ == termCountHardLimit) {
+          // app is misusing the API (there is more than
+          // one term per doc); in this case we make best
+          // effort to load what we can (see LUCENE-2142)
+          break;
+        }
+
+        final BytesRef term = termsEnum.next();
+        if (term == null) {
+          break;
+        }
+        final long pointer = bytes.copyUsingLengthPrefix(term);
+        docs = termsEnum.docs(null, docs, DocsEnum.FLAG_NONE);
+        while (true) {
+          final int docID = docs.nextDoc();
+          if (docID == DocIdSetIterator.NO_MORE_DOCS) {
+            break;
+          }
+          docToOffset.set(docID, pointer);
+        }
+      }
+    }
+
+    // maybe an int-only impl?
+    return new BinaryDocValuesImpl(bytes.freeze(true), docToOffset.getMutable());
+  }
+
+  private static class BinaryDocValuesImpl extends BinaryDocValues {
+    private final PagedBytes.Reader bytes;
+    private final PackedInts.Reader docToOffset;
+
+    public BinaryDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader docToOffset) {
+      this.bytes = bytes;
+      this.docToOffset = docToOffset;
+    }
+
+    @Override
+    public void get(int docID, BytesRef ret) {
+      final int pointer = (int) docToOffset.get(docID);
+      if (pointer == 0) {
+        ret.bytes = MISSING;
+        ret.offset = 0;
+        ret.length = 0;
+      } else {
+        bytes.fill(ret, pointer);
+      }
+    }
+  }
+
+  public static SortedDocValues uninvertSorted(AtomicReader reader, String field) throws IOException {
+    final int maxDoc = reader.maxDoc();
+
+    Terms terms = reader.terms(field);
+
+    final float acceptableOverheadRatio = PackedInts.FAST; // nocommit same as above
+
+    final PagedBytes bytes = new PagedBytes(15);
+
+    int startBytesBPV;
+    int startTermsBPV;
+    int startNumUniqueTerms;
+
+    final int termCountHardLimit;
+    if (maxDoc == Integer.MAX_VALUE) {
+      termCountHardLimit = Integer.MAX_VALUE;
+    } else {
+      termCountHardLimit = maxDoc + 1;
+    }
+
+    // TODO: use Uninvert?
+    if (terms != null) {
+      // Try for coarse estimate for number of bits; this
+      // should be an underestimate most of the time, which
+      // is fine -- GrowableWriter will reallocate as needed
+      long numUniqueTerms = terms.size();
+      if (numUniqueTerms != -1L) {
+        if (numUniqueTerms > termCountHardLimit) {
+          // app is misusing the API (there is more than
+          // one term per doc); in this case we make best
+          // effort to load what we can (see LUCENE-2142)
+          numUniqueTerms = termCountHardLimit;
+        }
+
+        startBytesBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
+        startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
+
+        startNumUniqueTerms = (int) numUniqueTerms;
+      } else {
+        startBytesBPV = 1;
+        startTermsBPV = 1;
+        startNumUniqueTerms = 1;
+      }
+    } else {
+      startBytesBPV = 1;
+      startTermsBPV = 1;
+      startNumUniqueTerms = 1;
+    }
+
+    GrowableWriter termOrdToBytesOffset = new GrowableWriter(startBytesBPV, 1 + startNumUniqueTerms, acceptableOverheadRatio);
+    final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);
+
+    int termOrd = 0;
+
+    // TODO: use Uninvert?
+
+    if (terms != null) {
+      final TermsEnum termsEnum = terms.iterator(null);
+      DocsEnum docs = null;
+
+      while (true) {
+        final BytesRef term = termsEnum.next();
+        if (term == null) {
+          break;
+        }
+        if (termOrd >= termCountHardLimit) {
+          break;
+        }
+
+        if (termOrd == termOrdToBytesOffset.size()) {
+          // NOTE: this code only runs if the incoming
+          // reader impl doesn't implement
+          // size (which should be uncommon)
+          termOrdToBytesOffset = termOrdToBytesOffset.resize(ArrayUtil.oversize(1 + termOrd, 1));
+        }
+        termOrdToBytesOffset.set(termOrd, bytes.copyUsingLengthPrefix(term));
+        docs = termsEnum.docs(null, docs, DocsEnum.FLAG_NONE);
+        while (true) {
+          final int docID = docs.nextDoc();
+          if (docID == DocIdSetIterator.NO_MORE_DOCS) {
+            break;
+          }
+          // Store 1+ ord into packed bits
+          docToTermOrd.set(docID, 1 + termOrd);
+        }
+        termOrd++;
+      }
+
+      if (termOrdToBytesOffset.size() > termOrd) {
+        termOrdToBytesOffset = termOrdToBytesOffset.resize(termOrd);
+      }
+    }
+
+    // maybe an int-only impl?
+    return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.getMutable(), docToTermOrd.getMutable(), termOrd);
+  }
+
+  public static class SortedDocValuesImpl extends SortedDocValues {
+    private final PagedBytes.Reader bytes;
+    private final PackedInts.Reader termOrdToBytesOffset;
+    private final PackedInts.Reader docToTermOrd;
+    private final int numOrd;
+
+    public SortedDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, PackedInts.Reader docToTermOrd, int numOrd) {
+      this.bytes = bytes;
+      this.docToTermOrd = docToTermOrd;
+      this.termOrdToBytesOffset = termOrdToBytesOffset;
+      this.numOrd = numOrd;
+    }
+
+    @Override
+    public int getValueCount() {
+      return numOrd;
+    }
+
+    @Override
+    public int getOrd(int docID) {
+      // Subtract 1, matching the 1+ord we did when
+      // storing, so that missing values, which are 0 in the
+      // packed ints, are returned as -1 ord:
+      return (int) docToTermOrd.get(docID) - 1;
+    }
+
+    @Override
+    public void lookupOrd(int ord, BytesRef ret) {
+      if (ord < 0) {
+        throw new IllegalArgumentException("ord must be >=0 (got ord=" + ord + ")");
+      }
+      bytes.fill(ret, termOrdToBytesOffset.get(ord));
+    }
+  }
+
+  public static SortedSetDocValues uninvertSortedSet(AtomicReader reader, String field) throws IOException {
+    return new DocTermOrds(reader, null, field).iterator(reader);
+  }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java b/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java
new file mode 100644
index 0000000..0f831a8
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java
@@ -0,0 +1,145 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.CloseableThreadLocal;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+public class UninvertingFilterReader extends FilterAtomicReader {
+
+  /**
+   * If DocValues are not stored for a given field, the UninvertingFilterReader
+   * creates them on-the-fly by uninverting the field, and then caching the
+   * results.
+   *
+   * @param in the base reader to wrap
+   * @param fieldTypes a map from field name to the DocValues type that field
+   *                   should be uninverted to
+   */
+  public UninvertingFilterReader(AtomicReader in, Map<String, FieldInfo.DocValuesType> fieldTypes) {
+    super(in);
+    this.fieldTypes = fieldTypes;
+  }
+
+  private final Map<String, FieldInfo.DocValuesType> fieldTypes;
+
+  // Per-thread cache of already-uninverted fields; values are the
+  // various DocValues implementations, hence Object
+  private final CloseableThreadLocal<Map<String, Object>> dvCache = new CloseableThreadLocal<Map<String, Object>>() {
+    @Override
+    protected Map<String, Object> initialValue() {
+      return new HashMap<>();
+    }
+  };
+
+  @Override
+  public NumericDocValues getNumericDocValues(String field) throws IOException {
+
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getNumericDocValues(field);
+
+    if (!fieldTypes.containsKey(field))
+      return null;
+    if (fieldTypes.get(field) != FieldInfo.DocValuesType.NUMERIC)
+      throw new IllegalArgumentException("Cannot uninvert field " + field + " of type "
+          + fieldTypes.get(field) + " to NumericDocValues");
+
+    Map<String, Object> uninvertedFields = dvCache.get();
+    if (uninvertedFields.containsKey(field))
+      return (NumericDocValues) uninvertedFields.get(field);
+    NumericDocValues dvs = Uninverter.uninvertNumeric(in, field);
+    uninvertedFields.put(field, dvs);
+    return dvs;
+  }
+
+  @Override
+  public BinaryDocValues getBinaryDocValues(String field) throws IOException {
+
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getBinaryDocValues(field);
+
+    if (!fieldTypes.containsKey(field))
+      return null;
+    if (fieldTypes.get(field) != FieldInfo.DocValuesType.BINARY)
+      throw new IllegalArgumentException("Cannot uninvert field " + field + " of type "
+          + fieldTypes.get(field) + " to BinaryDocValues");
+
+    Map<String, Object> uninvertedFields = dvCache.get();
+    if (uninvertedFields.containsKey(field))
+      return (BinaryDocValues) uninvertedFields.get(field);
+    BinaryDocValues dvs = Uninverter.uninvertBinary(in, field);
+    uninvertedFields.put(field, dvs);
+    return dvs;
+  }
+
+  @Override
+  public SortedDocValues getSortedDocValues(String field) throws IOException {
+
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getSortedDocValues(field);
+
+    if (!fieldTypes.containsKey(field))
+      return null;
+    if (fieldTypes.get(field) != FieldInfo.DocValuesType.SORTED)
+      throw new IllegalArgumentException("Cannot uninvert field " + field + " of type "
+          + fieldTypes.get(field) + " to SortedDocValues");
+
+    Map<String, Object> uninvertedFields = dvCache.get();
+    if (uninvertedFields.containsKey(field))
+      return (SortedDocValues) uninvertedFields.get(field);
+    SortedDocValues dvs = Uninverter.uninvertSorted(in, field);
+    uninvertedFields.put(field, dvs);
+    return dvs;
+  }
+
+  @Override
+  public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
+
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getSortedSetDocValues(field);
+
+    if (!fieldTypes.containsKey(field))
+      return null;
+    if (fieldTypes.get(field) != FieldInfo.DocValuesType.SORTED_SET)
+      throw new IllegalArgumentException("Cannot uninvert field " + field + " of type "
+          + fieldTypes.get(field) + " to SortedSetDocValues");
+
+    Map<String, Object> uninvertedFields = dvCache.get();
+    if (uninvertedFields.containsKey(field))
+      return (SortedSetDocValues) uninvertedFields.get(field);
+    SortedSetDocValues dvs = Uninverter.uninvertSortedSet(in, field);
+    uninvertedFields.put(field, dvs);
+    return dvs;
+  }
+
+  /** Clears the uninverted DocValues cached by the current thread. */
+  public void purgeCache() {
+    dvCache.get().clear();
+  }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java b/lucene/core/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
index dbdd181..fe67906 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
@@ -17,16 +17,16 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.index.AtomicReaderContext;
-import org.apache.lucene.index.DocsEnum; // javadoc @link
+import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 
+import java.io.IOException;
+
 /**
  * A {@link Filter} that only accepts documents whose single
  * term value in the specified field is contained in the
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestUninverter.java b/lucene/core/src/test/org/apache/lucene/index/TestUninverter.java
new file mode 100644
index 0000000..03c20cc
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestUninverter.java
@@ -0,0 +1,151 @@
+package org.apache.lucene.index;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DoubleField;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.LongField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.NumericUtils;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+public class TestUninverter extends LuceneTestCase {
+
+  public void testUninvertLongField() throws IOException {
+
+    Directory directory = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(random(), TEST_VERSION_CURRENT, null);
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, iwc);
+
+    Document doc = new Document();
+    Field longField = new LongField("field", 0, Field.Store.NO);
+    doc.add(longField);
+
+    int numDocs = atLeast(1000);
+    long[] values = new long[numDocs];
+    for (int i = 0; i < numDocs; i++) {
+      long value = random().nextLong();
+      longField.setLongValue(value);
+      values[i] = value;
+      iw.addDocument(doc);
+    }
+
+    iw.commit();
+    iw.forceMerge(1);
+    AtomicReader reader = iw.getReader().leaves().get(0).reader();
+
+    Map<String, FieldInfo.DocValuesType> uninverterFields = new HashMap<>();
+    uninverterFields.put("field", FieldInfo.DocValuesType.NUMERIC);
+    UninvertingFilterReader filterReader = new UninvertingFilterReader(reader, uninverterFields);
+
+    NumericDocValues dv = filterReader.getNumericDocValues("field");
+    for (int i = 0; i < numDocs; i++) {
+      assertEquals(values[i], dv.get(i));
+    }
+
+    reader.close();
+    iw.close();
+    directory.close();
+  }
+
+  public void testUninvertDoubleField() throws IOException {
+
+    Directory directory = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(random(), TEST_VERSION_CURRENT, null);
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, iwc);
+
+    Document doc = new Document();
+    Field doubleField = new DoubleField("field", 0, Field.Store.NO);
+    doc.add(doubleField);
+
+    int numDocs = atLeast(1000);
+    double[] values = new double[numDocs];
+    for (int i = 0; i < numDocs; i++) {
+      double value = random().nextDouble();
+      doubleField.setDoubleValue(value);
+      values[i] = value;
+      iw.addDocument(doc);
+    }
+
+    iw.commit();
+    iw.forceMerge(1);
+    AtomicReader reader = iw.getReader().leaves().get(0).reader();
+
+    Map<String, FieldInfo.DocValuesType> uninverterFields = new HashMap<>();
+    uninverterFields.put("field", FieldInfo.DocValuesType.NUMERIC);
+    UninvertingFilterReader filterReader = new UninvertingFilterReader(reader, uninverterFields);
+
+    NumericDocValues dv = filterReader.getNumericDocValues("field");
+    for (int i = 0; i < numDocs; i++) {
+      assertEquals(values[i], NumericUtils.sortableLongToDouble(dv.get(i)), 0.0000000001);
+    }
+
+    reader.close();
+    iw.close();
+    directory.close();
+  }
+
+  public void testUninvertIntField() throws IOException {
+
+    Directory directory = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(random(), TEST_VERSION_CURRENT, null);
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, iwc);
+
+    Document doc = new Document();
+    Field intField = new IntField("field", 0, Field.Store.NO);
+    doc.add(intField);
+
+    int numDocs = atLeast(1000);
+    int[] values = new int[numDocs];
+    for (int i = 0; i < numDocs; i++) {
+      int value = random().nextInt();
+      intField.setIntValue(value);
+      values[i] = value;
+      iw.addDocument(doc);
+    }
+
+    iw.commit();
+    iw.forceMerge(1);
+    AtomicReader reader = iw.getReader().leaves().get(0).reader();
+
+    Map<String, FieldInfo.DocValuesType> uninverterFields = new HashMap<>();
+    uninverterFields.put("field", FieldInfo.DocValuesType.NUMERIC);
+    UninvertingFilterReader filterReader = new UninvertingFilterReader(reader, uninverterFields);
+
+    NumericDocValues dv = filterReader.getNumericDocValues("field");
+    for (int i = 0; i < numDocs; i++) {
+      assertEquals(values[i], dv.get(i));
+    }
+
+    reader.close();
+    iw.close();
+    directory.close();
+  }
+
+}
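
A note on the double test above: it relies on NumericUtils' sortable-long encoding being exact and order-preserving, so decoding recovers the original value bit-for-bit. A quick round-trip sketch (doubleToSortableLong and sortableLongToDouble are existing NumericUtils methods; the literal 3.5 is just an example):

    long encoded = NumericUtils.doubleToSortableLong(3.5);        // order-preserving long encoding
    double decoded = NumericUtils.sortableLongToDouble(encoded);  // lossless round trip
    assert decoded == 3.5;

Given that, comparing with a tiny delta (or even 0.0) in the test is safe.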
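For reviewers, here is a minimal usage sketch of the new reader. It is not part of the patch: the index path, the "price" field, and the UninvertingExample class are illustrative assumptions, while UninvertingFilterReader and its Map<String, FieldInfo.DocValuesType> constructor argument come from the files above.

    import java.io.File;
    import java.util.Collections;
    import java.util.Map;

    import org.apache.lucene.index.AtomicReader;
    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.index.NumericDocValues;
    import org.apache.lucene.index.UninvertingFilterReader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class UninvertingExample {
      public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(new File("/path/to/index")); // illustrative path
        DirectoryReader reader = DirectoryReader.open(dir);
        // Declare which fields may be uninverted, and to which DocValues type
        Map<String, FieldInfo.DocValuesType> types =
            Collections.singletonMap("price", FieldInfo.DocValuesType.NUMERIC);
        for (AtomicReaderContext ctx : reader.leaves()) {
          AtomicReader wrapped = new UninvertingFilterReader(ctx.reader(), types);
          // First access uninverts the field; later calls on this thread hit the cache
          NumericDocValues dv = wrapped.getNumericDocValues("price");
          System.out.println("doc 0 price = " + dv.get(0));
        }
        reader.close();
        dir.close();
      }
    }

Because dvCache is a CloseableThreadLocal, each thread uninverts and caches independently, and purgeCache() only clears the calling thread's entries.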