diff --git a/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java b/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java
new file mode 100644
index 0000000..ca4d9d8
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java
@@ -0,0 +1,94 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.FieldCache;
+
+import java.io.IOException;
+
+/**
+ * A FilterAtomicReader that exposes doc values for fields that were not
+ * indexed with them: if doc values are not stored for a given field, they
+ * are created on the fly by uninverting the field, and the results are
+ * cached in the {@link FieldCache}.
+ */
+public class UninvertingFilterReader extends FilterAtomicReader {
+
+  /**
+   * @param in the base reader to wrap
+   */
+  public UninvertingFilterReader(AtomicReader in) {
+    super(in);
+  }
+
+  // TODO: should these caches be per-reader, rather than using the
+  // default FieldCache?
+
+  private static class NumericDocValuesImpl extends NumericDocValues {
+
+    final FieldCache.Longs source;
+
+    NumericDocValuesImpl(FieldCache.Longs source) {
+      this.source = source;
+    }
+
+    @Override
+    public long get(int docID) {
+      return source.get(docID);
+    }
+  }
+
+  @Override
+  public NumericDocValues getNumericDocValues(String field) throws IOException {
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getNumericDocValues(field);
+    return new NumericDocValuesImpl(FieldCache.DEFAULT.getLongs(in, field, false));
+  }
+
+  @Override
+  public BinaryDocValues getBinaryDocValues(String field) throws IOException {
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getBinaryDocValues(field);
+    return FieldCache.DEFAULT.getTerms(in, field);
+  }
+
+  @Override
+  public SortedDocValues getSortedDocValues(String field) throws IOException {
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getSortedDocValues(field);
+    return FieldCache.DEFAULT.getTermsIndex(in, field);
+  }
+
+  @Override
+  public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getSortedSetDocValues(field);
+    return FieldCache.DEFAULT.getDocTermOrds(in, field);
+  }
+
+}
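For orientation (not part of the patch): a minimal usage sketch of the reader above. The field name "price" is hypothetical, and is assumed to have been indexed so that FieldCache.getLongs can parse its terms.

    AtomicReader wrapped = new UninvertingFilterReader(atomicReader);
    NumericDocValues prices = wrapped.getNumericDocValues("price");
    for (int doc = 0; doc < wrapped.maxDoc(); doc++) {
      long price = prices.get(doc); // uninverted on first access, then served from the FieldCache
    }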
diff --git a/lucene/core/src/java/org/apache/lucene/search/DocTermOrdsRangeFilter.java b/lucene/core/src/java/org/apache/lucene/search/DocTermOrdsRangeFilter.java
index 07e2cba..0f5bf32 100644
--- a/lucene/core/src/java/org/apache/lucene/search/DocTermOrdsRangeFilter.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DocTermOrdsRangeFilter.java
@@ -16,13 +16,14 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.UninvertingFilterReader;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 
+import java.io.IOException;
+
 /**
  * A range filter built on top of a cached multi-valued term field (in {@link FieldCache}).
  *
@@ -59,7 +60,8 @@ public abstract class DocTermOrdsRangeFilter extends Filter {
     return new DocTermOrdsRangeFilter(field, lowerVal, upperVal, includeLower, includeUpper) {
       @Override
       public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
-        final SortedSetDocValues docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), field);
+        //final SortedSetDocValues docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), field);
+        final SortedSetDocValues docTermOrds = new UninvertingFilterReader(context.reader()).getSortedSetDocValues(field);
         final long lowerPoint = lowerVal == null ? -1 : docTermOrds.lookupTerm(lowerVal);
         final long upperPoint = upperVal == null ? -1 : docTermOrds.lookupTerm(upperVal);
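The change above only swaps how the SortedSetDocValues are obtained; callers of the filter are unaffected. A usage sketch, with a hypothetical field name and bounds:

    // Matches documents whose multi-valued "tags" field has a term in
    // ["alpha", "beta"], inclusive at both ends.
    Filter filter = DocTermOrdsRangeFilter.newBytesRefRange(
        "tags", new BytesRef("alpha"), new BytesRef("beta"), true, true);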
diff --git a/lucene/core/src/java/org/apache/lucene/index/Uninverter.java b/lucene/core/src/java/org/apache/lucene/index/Uninverter.java
new file mode 100644
index 0000000..3d8a0f7
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/Uninverter.java
@@ -0,0 +1,261 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.packed.GrowableWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
+import java.io.IOException;
+
+/**
+ * Static helpers that build doc values on the fly by uninverting an
+ * indexed field.
+ */
+public class Uninverter {
+
+  public static NumericDocValues uninvertNumeric(AtomicReader reader, String field) throws IOException {
+    return null; // nocommit!
+  }
+
+  public static BinaryDocValues uninvertBinary(AtomicReader reader, String field) throws IOException {
+    final int maxDoc = reader.maxDoc();
+    Terms terms = reader.terms(field);
+
+    final float acceptableOverheadRatio = PackedInts.FAST; // nocommit pass this in as a parameter?
+
+    final int termCountHardLimit = maxDoc;
+
+    // Holds the actual term data, expanded.
+    final PagedBytes bytes = new PagedBytes(15);
+
+    int startBPV;
+
+    if (terms != null) {
+      // Try for coarse estimate for number of bits; this
+      // should be an underestimate most of the time, which
+      // is fine -- GrowableWriter will reallocate as needed
+      long numUniqueTerms = terms.size();
+      if (numUniqueTerms != -1L) {
+        if (numUniqueTerms > termCountHardLimit) {
+          numUniqueTerms = termCountHardLimit;
+        }
+        startBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
+      } else {
+        startBPV = 1;
+      }
+    } else {
+      startBPV = 1;
+    }
+
+    final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
+
+    // pointer==0 means not set
+    bytes.copyUsingLengthPrefix(new BytesRef());
+
+    if (terms != null) {
+      int termCount = 0;
+      final TermsEnum termsEnum = terms.iterator(null);
+      DocsEnum docs = null;
+      while (true) {
+        if (termCount++ == termCountHardLimit) {
+          // app is misusing the API (there is more than
+          // one term per doc); in this case we make best
+          // effort to load what we can (see LUCENE-2142)
+          break;
+        }
+
+        final BytesRef term = termsEnum.next();
+        if (term == null) {
+          break;
+        }
+        final long pointer = bytes.copyUsingLengthPrefix(term);
+        docs = termsEnum.docs(null, docs, DocsEnum.FLAG_NONE);
+        while (true) {
+          final int docID = docs.nextDoc();
+          if (docID == DocIdSetIterator.NO_MORE_DOCS) {
+            break;
+          }
+          docToOffset.set(docID, pointer);
+        }
+      }
+    }
+
+    // maybe an int-only impl?
+    return new BinaryDocValuesImpl(bytes.freeze(true), docToOffset.getMutable());
+  }
+
+  private static class BinaryDocValuesImpl extends BinaryDocValues {
+    private final PagedBytes.Reader bytes;
+    private final PackedInts.Reader docToOffset;
+
+    public BinaryDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader docToOffset) {
+      this.bytes = bytes;
+      this.docToOffset = docToOffset;
+    }
+
+    @Override
+    public void get(int docID, BytesRef ret) {
+      final int pointer = (int) docToOffset.get(docID);
+      if (pointer == 0) {
+        ret.bytes = MISSING;
+        ret.offset = 0;
+        ret.length = 0;
+      } else {
+        bytes.fill(ret, pointer);
+      }
+    }
+  }
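uninvertNumeric above is still a nocommit stub. One possible shape, modeled on how FieldCacheImpl.getLongs walks NumericUtils prefix-coded terms; everything below is an assumption about where the patch might go, not the author's plan, and it assumes the field was indexed with trie-encoded longs (it needs an org.apache.lucene.util.NumericUtils import):

    // Sketch only: fills a plain long[] from the shift-0 (full precision) terms.
    public static NumericDocValues uninvertNumeric(AtomicReader reader, String field) throws IOException {
      final long[] values = new long[reader.maxDoc()];
      Terms terms = reader.terms(field);
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        DocsEnum docs = null;
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
          if (NumericUtils.getPrefixCodedLongShift(term) > 0) {
            break; // remaining terms are lower-precision prefix terms
          }
          final long value = NumericUtils.prefixCodedToLong(term);
          docs = termsEnum.docs(null, docs, DocsEnum.FLAG_NONE);
          int docID;
          while ((docID = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            values[docID] = value;
          }
        }
      }
      return new NumericDocValues() {
        @Override
        public long get(int docID) {
          return values[docID]; // docs with no term keep the default of 0
        }
      };
    }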
+
+  public static SortedDocValues uninvertSorted(AtomicReader reader, String field) throws IOException {
+    final int maxDoc = reader.maxDoc();
+
+    Terms terms = reader.terms(field);
+
+    final float acceptableOverheadRatio = PackedInts.FAST; // nocommit same as above
+
+    final PagedBytes bytes = new PagedBytes(15);
+
+    int startBytesBPV;
+    int startTermsBPV;
+    int startNumUniqueTerms;
+
+    final int termCountHardLimit;
+    if (maxDoc == Integer.MAX_VALUE) {
+      termCountHardLimit = Integer.MAX_VALUE;
+    } else {
+      termCountHardLimit = maxDoc + 1;
+    }
+
+    // TODO: use Uninvert?
+    if (terms != null) {
+      // Try for coarse estimate for number of bits; this
+      // should be an underestimate most of the time, which
+      // is fine -- GrowableWriter will reallocate as needed
+      long numUniqueTerms = terms.size();
+      if (numUniqueTerms != -1L) {
+        if (numUniqueTerms > termCountHardLimit) {
+          // app is misusing the API (there is more than
+          // one term per doc); in this case we make best
+          // effort to load what we can (see LUCENE-2142)
+          numUniqueTerms = termCountHardLimit;
+        }
+
+        startBytesBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
+        startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
+
+        startNumUniqueTerms = (int) numUniqueTerms;
+      } else {
+        startBytesBPV = 1;
+        startTermsBPV = 1;
+        startNumUniqueTerms = 1;
+      }
+    } else {
+      startBytesBPV = 1;
+      startTermsBPV = 1;
+      startNumUniqueTerms = 1;
+    }
+
+    GrowableWriter termOrdToBytesOffset = new GrowableWriter(startBytesBPV, 1 + startNumUniqueTerms, acceptableOverheadRatio);
+    final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);
+
+    int termOrd = 0;
+
+    // TODO: use Uninvert?
+
+    if (terms != null) {
+      final TermsEnum termsEnum = terms.iterator(null);
+      DocsEnum docs = null;
+
+      while (true) {
+        final BytesRef term = termsEnum.next();
+        if (term == null) {
+          break;
+        }
+        if (termOrd >= termCountHardLimit) {
+          break;
+        }
+
+        if (termOrd == termOrdToBytesOffset.size()) {
+          // NOTE: this code only runs if the incoming
+          // reader impl doesn't implement
+          // size (which should be uncommon)
+          termOrdToBytesOffset = termOrdToBytesOffset.resize(ArrayUtil.oversize(1 + termOrd, 1));
+        }
+        termOrdToBytesOffset.set(termOrd, bytes.copyUsingLengthPrefix(term));
+        docs = termsEnum.docs(null, docs, DocsEnum.FLAG_NONE);
+        while (true) {
+          final int docID = docs.nextDoc();
+          if (docID == DocIdSetIterator.NO_MORE_DOCS) {
+            break;
+          }
+          // Store 1+ord into packed bits
+          docToTermOrd.set(docID, 1 + termOrd);
+        }
+        termOrd++;
+      }
+
+      if (termOrdToBytesOffset.size() > termOrd) {
+        termOrdToBytesOffset = termOrdToBytesOffset.resize(termOrd);
+      }
+    }
+
+    // maybe an int-only impl?
+    return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.getMutable(), docToTermOrd.getMutable(), termOrd);
+  }
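A sketch of how the result of uninvertSorted is consumed, using the "1+ord in, ord-1 out" convention from the method above, so 0 in the packed ints means missing and surfaces as ord -1 (the field name "category" is hypothetical):

    SortedDocValues dv = Uninverter.uninvertSorted(atomicReader, "category");
    BytesRef scratch = new BytesRef();
    int ord = dv.getOrd(docID);
    if (ord == -1) {
      // document has no term for the field (packed value was the default 0)
    } else {
      dv.lookupOrd(ord, scratch); // scratch now holds the term bytes
    }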
+
+  public static class SortedDocValuesImpl extends SortedDocValues {
+    private final PagedBytes.Reader bytes;
+    private final PackedInts.Reader termOrdToBytesOffset;
+    private final PackedInts.Reader docToTermOrd;
+    private final int numOrd;
+
+    public SortedDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, PackedInts.Reader docToTermOrd, int numOrd) {
+      this.bytes = bytes;
+      this.docToTermOrd = docToTermOrd;
+      this.termOrdToBytesOffset = termOrdToBytesOffset;
+      this.numOrd = numOrd;
+    }
+
+    @Override
+    public int getValueCount() {
+      return numOrd;
+    }
+
+    @Override
+    public int getOrd(int docID) {
+      // Subtract 1, matching the 1+ord we did when
+      // storing, so that missing values, which are 0 in the
+      // packed ints, are returned as -1 ord:
+      return (int) docToTermOrd.get(docID) - 1;
+    }
+
+    @Override
+    public void lookupOrd(int ord, BytesRef ret) {
+      if (ord < 0) {
+        throw new IllegalArgumentException("ord must be >=0 (got ord=" + ord + ")");
+      }
+      bytes.fill(ret, termOrdToBytesOffset.get(ord));
+    }
+  }
+
+  public static SortedSetDocValues uninvertSortedSet(AtomicReader reader, String field) throws IOException {
+    return new DocTermOrds(reader, null, field).iterator(reader);
+  }
+
+}
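uninvertSortedSet simply delegates to the existing DocTermOrds uninverter. Iterating one document's ords through the resulting SortedSetDocValues looks like this (field name hypothetical):

    SortedSetDocValues dv = Uninverter.uninvertSortedSet(atomicReader, "tags");
    dv.setDocument(docID);
    BytesRef scratch = new BytesRef();
    long ord;
    while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
      dv.lookupOrd(ord, scratch); // one of the document's terms
    }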
diff --git a/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java b/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java
new file mode 100644
index 0000000..d3cc514
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/UninvertingFilterReader.java
@@ -0,0 +1,145 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.CloseableThreadLocal;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * If doc values are not stored for a given field, this reader creates them
+ * on the fly by uninverting the field, caching the results per thread.
+ */
+public class UninvertingFilterReader extends FilterAtomicReader {
+
+  /**
+   * @param in the base reader to wrap
+   * @param fieldTypes maps field names to the doc values type each field
+   *                   should be uninverted to
+   */
+  public UninvertingFilterReader(AtomicReader in, Map<String, FieldInfo.DocValuesType> fieldTypes) {
+    super(in);
+    this.fieldTypes = fieldTypes;
+  }
+
+  private final Map<String, FieldInfo.DocValuesType> fieldTypes;
+
+  // Uninverted doc values are cached per thread, since iteration state such
+  // as SortedSetDocValues' setDocument/nextOrd is not thread-safe.
+  private final CloseableThreadLocal<Map<String, Object>> dvCache = new CloseableThreadLocal<Map<String, Object>>() {
+    @Override
+    protected Map<String, Object> initialValue() {
+      return new HashMap<>();
+    }
+  };
+
+  @Override
+  public NumericDocValues getNumericDocValues(String field) throws IOException {
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getNumericDocValues(field);
+
+    if (!fieldTypes.containsKey(field))
+      return null;
+    if (fieldTypes.get(field) != FieldInfo.DocValuesType.NUMERIC)
+      throw new IllegalArgumentException("Cannot uninvert field " + field + " of type "
+                                          + fieldTypes.get(field) + " to NumericDocValues");
+
+    Map<String, Object> uninvertedFields = dvCache.get();
+    if (uninvertedFields.containsKey(field))
+      return (NumericDocValues) uninvertedFields.get(field);
+    NumericDocValues dvs = Uninverter.uninvertNumeric(in, field);
+    uninvertedFields.put(field, dvs);
+    return dvs;
+  }
+
+  @Override
+  public BinaryDocValues getBinaryDocValues(String field) throws IOException {
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getBinaryDocValues(field);
+
+    if (!fieldTypes.containsKey(field))
+      return null;
+    if (fieldTypes.get(field) != FieldInfo.DocValuesType.BINARY)
+      throw new IllegalArgumentException("Cannot uninvert field " + field + " of type "
+                                          + fieldTypes.get(field) + " to BinaryDocValues");
+
+    Map<String, Object> uninvertedFields = dvCache.get();
+    if (uninvertedFields.containsKey(field))
+      return (BinaryDocValues) uninvertedFields.get(field);
+    BinaryDocValues dvs = Uninverter.uninvertBinary(in, field);
+    uninvertedFields.put(field, dvs);
+    return dvs;
+  }
+
+  @Override
+  public SortedDocValues getSortedDocValues(String field) throws IOException {
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getSortedDocValues(field);
+
+    if (!fieldTypes.containsKey(field))
+      return null;
+    if (fieldTypes.get(field) != FieldInfo.DocValuesType.SORTED)
+      throw new IllegalArgumentException("Cannot uninvert field " + field + " of type "
+                                          + fieldTypes.get(field) + " to SortedDocValues");
+
+    Map<String, Object> uninvertedFields = dvCache.get();
+    if (uninvertedFields.containsKey(field))
+      return (SortedDocValues) uninvertedFields.get(field);
+    SortedDocValues dvs = Uninverter.uninvertSorted(in, field);
+    uninvertedFields.put(field, dvs);
+    return dvs;
+  }
+
+  @Override
+  public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
+    FieldInfo fi = in.getFieldInfos().fieldInfo(field);
+    if (fi == null)
+      return null;
+    if (fi.hasDocValues())
+      return super.getSortedSetDocValues(field);
+
+    if (!fieldTypes.containsKey(field))
+      return null;
+    if (fieldTypes.get(field) != FieldInfo.DocValuesType.SORTED_SET)
+      throw new IllegalArgumentException("Cannot uninvert field " + field + " of type "
+                                          + fieldTypes.get(field) + " to SortedSetDocValues");
+
+    Map<String, Object> uninvertedFields = dvCache.get();
+    if (uninvertedFields.containsKey(field))
+      return (SortedSetDocValues) uninvertedFields.get(field);
+    SortedSetDocValues dvs = Uninverter.uninvertSortedSet(in, field);
+    uninvertedFields.put(field, dvs);
+    return dvs;
+  }
+
+  /** Clears any doc values uninverted and cached by the current thread. */
+  public void purgeCache() {
+    dvCache.get().clear();
+  }
+
+}
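Wiring up this second, typed variant (field names hypothetical): the caller declares up front which doc values type each field should be uninverted to, and undeclared fields simply return null. Note that with uninvertNumeric still a stub, the numeric call below would currently cache and return null.

    Map<String, FieldInfo.DocValuesType> fieldTypes = new HashMap<>();
    fieldTypes.put("price", FieldInfo.DocValuesType.NUMERIC);
    fieldTypes.put("tags", FieldInfo.DocValuesType.SORTED_SET);

    AtomicReader wrapped = new UninvertingFilterReader(atomicReader, fieldTypes);
    NumericDocValues price = wrapped.getNumericDocValues("price");   // uninverted, cached per thread
    SortedSetDocValues tags = wrapped.getSortedSetDocValues("tags"); // uninverted, cached per thread
    BinaryDocValues title = wrapped.getBinaryDocValues("title");     // null: "title" was not declared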