Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1368053) +++ lucene/CHANGES.txt (working copy) @@ -13,7 +13,10 @@ * LUCENE-4249: Changed the explanation of the PayloadTermWeight to use the underlying PayloadFunction's explanation as the explanation for the payload score. (Scott Smerchek via Robert Muir) - + +* LUCENE-4069: Added BloomFilteringPostingsFormat for use with low-frequency terms + such as primary keys (Mark Harwood, Mike McCandless) + * LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese iteration marks. (Robert Muir, Christian Moen) Index: lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java (working copy) @@ -0,0 +1,63 @@ +package org.apache.lucene.codecs.bloom; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.FuzzySet; + + +/** + * Class used to create index-time {@link FuzzySet} appropriately configured for + * each field. Also called to right-size bitsets for serialization. + * @lucene.experimental + */ +public abstract class BloomFilterFactory { + + /** + * + * @param state The content to be indexed + * @param info + * the field requiring a BloomFilter + * @return An appropriately sized set or null if no BloomFiltering required + */ + public abstract FuzzySet getSetForField(SegmentWriteState state, FieldInfo info); + + /** + * Called when downsizing bitsets for serialization + * + * @param fieldInfo + * The field with sparse set bits + * @param initialSet + * The bits accumulated + * @return null or a hopefully more densely packed, smaller bitset + */ + public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) { + // Aim for a bitset size that would have 10% of bits set (so 90% of searches + // would fail-fast) + float targetMaxSaturation = 0.1f; + return initialSet.downsize(targetMaxSaturation); + } + + /** + * Used to determine if the given filter has reached saturation and should be retired i.e. not saved any more + * @param bloomFilter + * @param fieldInfo + * @return + */ + public abstract boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo); + +} Index: lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (working copy) @@ -0,0 +1,514 @@ +package org.apache.lucene.codecs.bloom; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsConsumer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.TermStats; +import org.apache.lucene.codecs.TermsConsumer; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FuzzySet; +import org.apache.lucene.util.FuzzySet.ContainsResult; 
+import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.hash.MurmurHash2; + +/** + *

+ * A {@link PostingsFormat} useful for low doc-frequency fields such as primary + * keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail" + * for reads in segments known to have no record of the key. A choice of + * delegate PostingsFormat is used to record all other Postings data. + *

+ *

+ * A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter + * settings on a per-field basis. The default configuration is + * {@link DefaultBloomFilterFactory} which allocates a ~8mb bitset and hashes + * values using {@link MurmurHash2}. This should be suitable for most purposes. + *

+ *

+ * The format of the blm file is as follows: + *

+ * + * @lucene.experimental + */ +public class BloomFilteringPostingsFormat extends PostingsFormat { + + public static final String BLOOM_CODEC_NAME = "BloomFilter"; + public static final int BLOOM_CODEC_VERSION = 1; + + /** Extension of Bloom Filters file */ + static final String BLOOM_EXTENSION = "blm"; + + BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory(); + private PostingsFormat delegatePostingsFormat; + + /** + * Creates Bloom filters for a selection of fields created in the index. This + * is recorded as a set of Bitsets held as a segment summary in an additional + * "blm" file. This PostingsFormat delegates to a choice of delegate + * PostingsFormat for encoding all other postings data. + * + * @param delegatePostingsFormat + * The PostingsFormat that records all the non-bloom filter data i.e. + * postings info. + * @param bloomFilterFactory + * The {@link BloomFilterFactory} responsible for sizing BloomFilters + * appropriately + */ + public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat, + BloomFilterFactory bloomFilterFactory) { + super(BLOOM_CODEC_NAME); + this.delegatePostingsFormat = delegatePostingsFormat; + this.bloomFilterFactory = bloomFilterFactory; + } + + /** + * Creates Bloom filters for a selection of fields created in the index. This + * is recorded as a set of Bitsets held as a segment summary in an additional + * "blm" file. This PostingsFormat delegates to a choice of delegate + * PostingsFormat for encoding all other postings data. This choice of + * constructor defaults to the {@link DefaultBloomFilterFactory} for + * configuring per-field BloomFilters. + * + * @param delegatePostingsFormat + * The PostingsFormat that records all the non-bloom filter data i.e. + * postings info. 
+ */ + public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) { + this(delegatePostingsFormat, new DefaultBloomFilterFactory()); + } + + // Used only by core Lucene at read-time via Service Provider instantiation - + // do not use at Write-time in application code. + public BloomFilteringPostingsFormat() { + super(BLOOM_CODEC_NAME); + } + + public FieldsConsumer fieldsConsumer(SegmentWriteState state) + throws IOException { + if (delegatePostingsFormat == null) { + throw new UnsupportedOperationException("Error - " + getClass().getName() + + " has been constructed without a choice of PostingsFormat"); + } + return new BloomFilteredFieldsConsumer( + delegatePostingsFormat.fieldsConsumer(state), state, + delegatePostingsFormat); + } + + public FieldsProducer fieldsProducer(SegmentReadState state) + throws IOException { + return new BloomFilteredFieldsProducer(state); + } + + public class BloomFilteredFieldsProducer extends FieldsProducer { + private FieldsProducer delegateFieldsProducer; + HashMap bloomsByFieldName = new HashMap(); + + public BloomFilteredFieldsProducer(SegmentReadState state) + throws IOException { + + String bloomFileName = IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION); + IndexInput bloomIn = null; + try { + bloomIn = state.dir.openInput(bloomFileName, state.context); + CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION, + BLOOM_CODEC_VERSION); + // // Load the hash function used in the BloomFilter + // hashFunction = HashFunction.forName(bloomIn.readString()); + // Load the delegate postings format + PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn + .readString()); + + this.delegateFieldsProducer = delegatePostingsFormat + .fieldsProducer(state); + int numBlooms = bloomIn.readInt(); + for (int i = 0; i < numBlooms; i++) { + int fieldNum = bloomIn.readInt(); + FuzzySet bloom = FuzzySet.deserialize(bloomIn); + FieldInfo fieldInfo = 
state.fieldInfos.fieldInfo(fieldNum); + bloomsByFieldName.put(fieldInfo.name, bloom); + } + } finally { + IOUtils.close(bloomIn); + } + + } + + public FieldsEnum iterator() throws IOException { + return new BloomFilteredFieldsEnum(delegateFieldsProducer.iterator(), + bloomsByFieldName); + } + + public void close() throws IOException { + delegateFieldsProducer.close(); + } + + public Terms terms(String field) throws IOException { + FuzzySet filter = bloomsByFieldName.get(field); + if (filter == null) { + return delegateFieldsProducer.terms(field); + } else { + Terms result = delegateFieldsProducer.terms(field); + if (result == null) { + return null; + } + return new BloomFilteredTerms(result, filter); + } + } + + public int size() throws IOException { + return delegateFieldsProducer.size(); + } + + public long getUniqueTermCount() throws IOException { + return delegateFieldsProducer.getUniqueTermCount(); + } + + // Not all fields in a segment may be subject to a bloom filter. This class + // wraps Terms objects appropriately if a filtering request is present + class BloomFilteredFieldsEnum extends FieldsEnum { + private FieldsEnum delegateFieldsEnum; + private HashMap bloomsByFieldName; + private String currentFieldName; + + public BloomFilteredFieldsEnum(FieldsEnum iterator, + HashMap bloomsByFieldName) { + this.delegateFieldsEnum = iterator; + this.bloomsByFieldName = bloomsByFieldName; + } + + public AttributeSource attributes() { + return delegateFieldsEnum.attributes(); + } + + public String next() throws IOException { + currentFieldName = delegateFieldsEnum.next(); + return currentFieldName; + } + + public Terms terms() throws IOException { + FuzzySet filter = bloomsByFieldName.get(currentFieldName); + if (filter == null) { + return delegateFieldsEnum.terms(); + } else { + Terms result = delegateFieldsEnum.terms(); + if (result == null) { + return null; + } + // wrap the terms object with a bloom filter + return new BloomFilteredTerms(result, filter); + } + } 
+ + } + + class BloomFilteredTerms extends Terms { + private Terms delegateTerms; + private FuzzySet filter; + + public BloomFilteredTerms(Terms terms, FuzzySet filter) { + this.delegateTerms = terms; + this.filter = filter; + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, + final BytesRef startTerm) throws IOException { + return delegateTerms.intersect(compiled, startTerm); + } + + @Override + public TermsEnum iterator(TermsEnum reuse) throws IOException { + TermsEnum result; + if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) { + // recycle the existing BloomFilteredTermsEnum by asking the delegate + // to recycle its contained TermsEnum + BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse; + if (bfte.filter == filter) { + bfte.delegateTermsEnum = delegateTerms + .iterator(bfte.delegateTermsEnum); + return bfte; + } + } + // We have been handed something we cannot reuse (either null, wrong + // class or wrong filter) so allocate a new object + result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse), + filter); + return result; + } + + @Override + public Comparator getComparator() throws IOException { + return delegateTerms.getComparator(); + } + + @Override + public long size() throws IOException { + return delegateTerms.size(); + } + + @Override + public long getSumTotalTermFreq() throws IOException { + return delegateTerms.getSumTotalTermFreq(); + } + + @Override + public long getSumDocFreq() throws IOException { + return delegateTerms.getSumDocFreq(); + } + + @Override + public int getDocCount() throws IOException { + return delegateTerms.getDocCount(); + } + } + + class BloomFilteredTermsEnum extends TermsEnum { + + TermsEnum delegateTermsEnum; + private FuzzySet filter; + + public BloomFilteredTermsEnum(TermsEnum iterator, FuzzySet filter) { + this.delegateTermsEnum = iterator; + this.filter = filter; + } + + @Override + public final BytesRef next() throws IOException { + return 
delegateTermsEnum.next(); + } + + @Override + public final Comparator getComparator() { + return delegateTermsEnum.getComparator(); + } + + @Override + public final boolean seekExact(BytesRef text, boolean useCache) + throws IOException { + // The magical fail-fast speed up that is the entire point of all of + // this code - save a disk seek if there is a match on an in-memory + // structure + // that may occasionally give a false positive but guaranteed no false + // negatives + if (filter.contains(text) == ContainsResult.NO) { + return false; + } + return delegateTermsEnum.seekExact(text, useCache); + } + + @Override + public final SeekStatus seekCeil(BytesRef text, boolean useCache) + throws IOException { + return delegateTermsEnum.seekCeil(text, useCache); + } + + @Override + public final void seekExact(long ord) throws IOException { + delegateTermsEnum.seekExact(ord); + } + + @Override + public final BytesRef term() throws IOException { + return delegateTermsEnum.term(); + } + + @Override + public final long ord() throws IOException { + return delegateTermsEnum.ord(); + } + + @Override + public final int docFreq() throws IOException { + return delegateTermsEnum.docFreq(); + } + + @Override + public final long totalTermFreq() throws IOException { + return delegateTermsEnum.totalTermFreq(); + } + + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, + DocsAndPositionsEnum reuse, int flags) throws IOException { + return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags); + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) + throws IOException { + return delegateTermsEnum.docs(liveDocs, reuse, flags); + } + + + } + + } + + class BloomFilteredFieldsConsumer extends FieldsConsumer { + private FieldsConsumer delegateFieldsConsumer; + private Map bloomFilters = new HashMap(); + private SegmentWriteState state; + + // private PostingsFormat delegatePostingsFormat; + + public 
BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer, + SegmentWriteState state, PostingsFormat delegatePostingsFormat) { + this.delegateFieldsConsumer = fieldsConsumer; + // this.delegatePostingsFormat=delegatePostingsFormat; + this.state = state; + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field); + if (bloomFilter != null) { + assert bloomFilters.containsKey(field) == false; + bloomFilters.put(field, bloomFilter); + return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field),bloomFilter); + } else { + // No, use the unfiltered fieldsConsumer - we are not interested in + // recording any term Bitsets. + return delegateFieldsConsumer.addField(field); + } + } + + @Override + public void close() throws IOException { + delegateFieldsConsumer.close(); + // Now we are done accumulating values for these fields + List> nonSaturatedBlooms = new ArrayList>(); + + for (Entry entry : bloomFilters.entrySet()) { + FuzzySet bloomFilter = entry.getValue(); + if(!bloomFilterFactory.isSaturated(bloomFilter,entry.getKey())){ + nonSaturatedBlooms.add(entry); + } + } + String bloomFileName = IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION); + IndexOutput bloomOutput = null; + try { + bloomOutput = state.directory + .createOutput(bloomFileName, state.context); + CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME, + BLOOM_CODEC_VERSION); + // remember the name of the postings format we will delegate to + bloomOutput.writeString(delegatePostingsFormat.getName()); + + // First field in the output file is the number of fields+blooms saved + bloomOutput.writeInt(nonSaturatedBlooms.size()); + for (Entry entry : nonSaturatedBlooms) { + FieldInfo fieldInfo = entry.getKey(); + FuzzySet bloomFilter = entry.getValue(); + bloomOutput.writeInt(fieldInfo.number); + saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, 
fieldInfo); + } + } finally { + IOUtils.close(bloomOutput); + } + //We are done with large bitsets so no need to keep them hanging around + bloomFilters.clear(); + } + + private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput, + FuzzySet bloomFilter, FieldInfo fieldInfo) throws IOException { + + FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo, + bloomFilter); + if (rightSizedSet == null) { + rightSizedSet = bloomFilter; + } + rightSizedSet.serialize(bloomOutput); + } + + } + + class WrappedTermsConsumer extends TermsConsumer { + private TermsConsumer delegateTermsConsumer; + private FuzzySet bloomFilter; + + public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) { + this.delegateTermsConsumer = termsConsumer; + this.bloomFilter = bloomFilter; + } + + public PostingsConsumer startTerm(BytesRef text) throws IOException { + return delegateTermsConsumer.startTerm(text); + } + + public void finishTerm(BytesRef text, TermStats stats) throws IOException { + + // Record this term in our BloomFilter + if (stats.docFreq > 0) { + bloomFilter.addValue(text); + } + delegateTermsConsumer.finishTerm(text, stats); + } + + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) + throws IOException { + delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount); + } + + public Comparator getComparator() throws IOException { + return delegateTermsConsumer.getComparator(); + } + + } + +} Index: lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java (working copy) @@ -0,0 +1,44 @@ +package org.apache.lucene.codecs.bloom; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.FuzzySet; +import org.apache.lucene.util.hash.HashFunction; +import org.apache.lucene.util.hash.MurmurHash2; + +/** + * Default policy is to allocate a bitset with 10% saturation given a unique term per document. + * Bits are set via MurmurHash2 hashing function. + * @lucene.experimental + */ +public class DefaultBloomFilterFactory extends BloomFilterFactory { + + @Override + public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) { + //Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with 10% of bits set + return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), 0.10f, new MurmurHash2()); + } + + @Override + public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) { + // Don't bother saving bitsets if >90% of bits are set - we don't want to + // throw any more memory at this problem. 
+ return bloomFilter.getSaturation() > 0.9f; + } + +} Index: lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html (revision 0) +++ lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html (working copy) @@ -0,0 +1,25 @@ + + + + + + + +Codec PostingsFormat for fast access to low-frequency terms such as primary key fields. + + \ No newline at end of file Index: lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java (revision 1368053) +++ lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java (working copy) @@ -53,7 +53,13 @@ /** Reads a segment. NOTE: by the time this call * returns, it must hold open any files it will need to - * use; else, those files may be deleted. */ + * use; else, those files may be deleted. + * Additionally, required files may be deleted during the execution of + * this call before there is a chance to open them. Under these + * circumstances an IOException should be thrown by the implementation. + * IOExceptions are expected and will automatically cause a retry of the + * segment opening logic with the newly revised segments. + * */ public abstract FieldsProducer fieldsProducer(SegmentReadState state) throws IOException; @Override Index: lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java (revision 1368053) +++ lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java (working copy) @@ -54,6 +54,12 @@ bits = new long[bits2words(numBits)]; } + public FixedBitSet(long[]storedBits,int numBits) { + this.numBits = numBits; + this.bits = storedBits; + } + + /** Makes full copy. 
*/ public FixedBitSet(FixedBitSet other) { bits = new long[other.bits.length]; Index: lucene/core/src/java/org/apache/lucene/util/FuzzySet.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/FuzzySet.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/FuzzySet.java (working copy) @@ -0,0 +1,292 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.hash.HashFunction; + +/** + *

+ * A class used to represent a set of many, potentially large, values (e.g. many + * long strings such as URLs), using a significantly smaller amount of memory. + *

+ *

+ * The set is "lossy" in that it cannot definitively state that it does contain + * a value but it can definitively say if a value is not in + * the set. It can therefore be used as a Bloom Filter. +

+ * Another application of the set is that it can be used to perform fuzzy counting because + * it can estimate reasonably accurately how many unique values are contained in the set. + *

+ *

This class is NOT thread-safe.

+ *

+ * Internally a Bitset is used to record values and once a client has finished recording + * a stream of values the {@link #downsize(float)} method can be used to create a suitably smaller set that + * is sized appropriately for the number of values recorded and desired saturation levels. + * + *

+ * @lucene.experimental + */ +public class FuzzySet { + + public static final int FUZZY_SERIALIZATION_VERSION=1; + + public enum ContainsResult { + MAYBE, NO + }; + private HashFunction hashFunction; + private FixedBitSet filter; + private int bloomSize; + + //The sizes of BitSet used are all numbers that, when expressed in binary form, + //are all ones. This is to enable fast downsizing from one bitset to another + //by simply ANDing each set index in one bitset with the size of the target bitset + // - this provides a fast modulo of the number. Values previously accumulated in + // a large bitset and then mapped to a smaller set can be looked up using a single + // AND operation of the query term's hash rather than needing to perform a 2-step + // translation of the query term that mirrors the stored content's reprojections. + static final int usableBitSetSizes[]; + static + { + usableBitSetSizes=new int[30]; + int mask=1; + int size=mask; + for (int i = 0; i < usableBitSetSizes.length; i++) { + size=(size<<1)|mask; + usableBitSetSizes[i]=size; + } + } + + /** + * Rounds down required maxNumberOfBits to the nearest number that is made up + * of all ones as a binary number. + * Use this method where controlling memory use is paramount. + */ + public static int getNearestSetSize(int maxNumberOfBits) + { + int result=usableBitSetSizes[0]; + for (int i = 0; i < usableBitSetSizes.length; i++) { + if(usableBitSetSizes[i]<=maxNumberOfBits) + { + result=usableBitSetSizes[i]; + } + } + return result; + } + + /** + * Use this method to choose a set size where accuracy (low content saturation) is more important + * than deciding how much memory to throw at the problem. 
+ * @param maxNumberOfValuesExpected + * @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values have been recorded + * @return + */ + public static int getNearestSetSize(int maxNumberOfValuesExpected, + float desiredSaturation) { + // Iterate around the various scales of bitset from smallest to largest looking for the first that + // satisfies value volumes at the chosen saturation level + for (int i = 0; i < usableBitSetSizes.length; i++) { + int numSetBitsAtDesiredSaturation = (int) (usableBitSetSizes[i] * desiredSaturation); + int estimatedNumUniqueValues = getEstimatedNumberUniqueValuesAllowingForCollisions( + usableBitSetSizes[i], numSetBitsAtDesiredSaturation); + if (estimatedNumUniqueValues > maxNumberOfValuesExpected) { + return usableBitSetSizes[i]; + } + } + return -1; + } + + public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes, HashFunction hashFunction) + { + int setSize=getNearestSetSize(maxNumBytes); + return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction); + } + + public static FuzzySet createSetBasedOnQuality(int maxNumUniqueValues, float desiredMaxSaturation, HashFunction hashFunction) + { + int setSize=getNearestSetSize(maxNumUniqueValues,desiredMaxSaturation); + return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction); + } + + + + + private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) { + super(); + this.filter = filter; + this.bloomSize = bloomSize; + this.hashFunction=hashFunction; + } + + /** + * The main method required for a Bloom filter which, given a value determines set membership. + * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false. 
+ * @param value + * @return NO or MAYBE + */ + public ContainsResult contains(BytesRef value) { + int hash = hashFunction.hash(value); + if (hash < 0) { + hash = hash * -1; + } + return mayContainValue(hash); + } + + /** + * Serializes the data set to file using the following format: + *
    + *
+ * <ul>
+ * <li>FuzzySet --&gt; FuzzySetVersion,HashFunctionName,BloomSize,
+ * NumBitSetWords,&lt;BitSetWord&gt;<sup>NumBitSetWords</sup></li>
+ * <li>HashFunctionName --&gt; {@link DataOutput#writeString(String) String} The
+ * name of a ServiceProvider registered {@link HashFunction}</li>
+ * <li>FuzzySetVersion --&gt; {@link DataOutput#writeInt Uint32} The version number of the {@link FuzzySet} class</li>
+ * <li>BloomSize --&gt; {@link DataOutput#writeInt Uint32} The modulo value used
+ * to project hashes into the field's Bitset</li>
+ * <li>NumBitSetWords --&gt; {@link DataOutput#writeInt Uint32} The number of
+ * longs (as returned from {@link FixedBitSet#getBits})</li>
+ * <li>BitSetWord --&gt; {@link DataOutput#writeLong Long} A long from the array
+ * returned by {@link FixedBitSet#getBits}</li>
+ * </ul>
+ * @param out Data output stream + * @throws IOException + */ + public void serialize(DataOutput out) throws IOException + { + out.writeInt(FUZZY_SERIALIZATION_VERSION); + out.writeString(hashFunction.getName()); + out.writeInt(bloomSize); + long[] bits = filter.getBits(); + out.writeInt(bits.length); + for (int i = 0; i < bits.length; i++) { + // Can't used VLong encoding because cant cope with negative numbers + // output by FixedBitSet + out.writeLong(bits[i]); + } + } + public static FuzzySet deserialize(DataInput in) throws IOException + { + int version=in.readInt(); + if(version!=FUZZY_SERIALIZATION_VERSION) + { + throw new IOException("Error deserializing: set version is not "+FUZZY_SERIALIZATION_VERSION); + } + HashFunction hashFunction=HashFunction.forName(in.readString()); + int bloomSize=in.readInt(); + int numLongs=in.readInt(); + long[]longs=new long[numLongs]; + for (int i = 0; i < numLongs; i++) { + longs[i]=in.readLong(); + } + FixedBitSet bits = new FixedBitSet(longs,bloomSize+1); + return new FuzzySet(bits,bloomSize,hashFunction); + } + + private ContainsResult mayContainValue(int positiveHash) { + assert positiveHash >= 0; + // Bloom sizes are always base 2 and so can be ANDed for a fast modulo + int pos = positiveHash & bloomSize; + if (filter.get(pos)) { + // This term may be recorded in this index (but could be a collision) + return ContainsResult.MAYBE; + } + // definitely NOT in this segment + return ContainsResult.NO; + } + + /** + * Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the + * chosen size of the internal bitset. + * @param bytes + * @throws IOException + */ + public void addValue(BytesRef value) throws IOException { + int hash = hashFunction.hash(value); + if (hash < 0) { + hash = hash * -1; + } + // Bitmasking using bloomSize is effectively a modulo operation. 
+ int bloomPos = hash & bloomSize; + filter.set(bloomPos); + } + + + /** + * + * @param targetSaturation A number between 0 and 1 describing the % of bits that would ideally be set in the + * result. Lower values have better qccuracy but require more space. + * @return a smaller FuzzySet or null if the current set is already over-saturated + */ + public FuzzySet downsize(float targetMaxSaturation) + { + int numBitsSet = filter.cardinality(); + FixedBitSet rightSizedBitSet = filter; + int rightSizedBitSetSize = bloomSize; + //Hopefully find a smaller size bitset into which we can project accumulated values while maintaining desired saturation level + for (int i = 0; i < usableBitSetSizes.length; i++) { + int candidateBitsetSize = usableBitSetSizes[i]; + float candidateSaturation = (float) numBitsSet + / (float) candidateBitsetSize; + if (candidateSaturation <= targetMaxSaturation) { + rightSizedBitSetSize = candidateBitsetSize; + break; + } + } + // Re-project the numbers to a smaller space if necessary + if (rightSizedBitSetSize < bloomSize) { + // Reset the choice of bitset to the smaller version + rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1); + // Map across the bits from the large set to the smaller one + int bitIndex = 0; + do { + bitIndex = filter.nextSetBit(bitIndex); + if (bitIndex >= 0) { + // Project the larger number into a smaller one effectively + // modulo-ing by using the target bitset size as a mask + int downSizedBitIndex = bitIndex & rightSizedBitSetSize; + rightSizedBitSet.set(downSizedBitIndex); + bitIndex++; + } + } while ( (bitIndex >= 0)&&(bitIndex<=bloomSize)); + } else { + return null; + } + return new FuzzySet(rightSizedBitSet,rightSizedBitSetSize, hashFunction); + } + + public int getEstimatedUniqueValues() + { + return getEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.cardinality()); + } + + // Given a set size and a the number of set bits, produces an estimate of the number of unique values recorded + 
public static int getEstimatedNumberUniqueValuesAllowingForCollisions( + int setSize, int numRecordedBits) { + double setSizeAsDouble = setSize; + double numRecordedBitsAsDouble = numRecordedBits; + double saturation = numRecordedBitsAsDouble / setSizeAsDouble; + double logInverseSaturation = Math.log(1 - saturation) * -1; + return (int) (setSizeAsDouble * logInverseSaturation); + } + + public float getSaturation() { + int numBitsSet = filter.cardinality(); + return (float) numBitsSet / (float) bloomSize; + } +} \ No newline at end of file Index: lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java (working copy) @@ -0,0 +1,70 @@ +package org.apache.lucene.util.hash; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.util.Set; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NamedSPILoader; + + +/** + * Base class for hashing functions that can be referred to by name. 
+ * Subclasses are expected to provide threadsafe implementations of the hash function + * on the range of bytes referenced in the provided {@link BytesRef} + * @lucene.experimental + */ +public abstract class HashFunction implements NamedSPILoader.NamedSPI { + + /** + * Hashes the contents of the referenced bytes + * @param bytes the data to be hashed + * @return the hash of the bytes referenced by bytes.offset and length bytes.length + */ + public abstract int hash(BytesRef bytes); + + private static final NamedSPILoader<HashFunction> loader = + new NamedSPILoader<HashFunction>(HashFunction.class); + + private final String name; + + public HashFunction(String name) { + NamedSPILoader.checkServiceName(name); + this.name = name; + } + + /** Returns this hash function's name */ + @Override + public final String getName() { + return name; + } + + /** looks up a hash function by name */ + public static HashFunction forName(String name) { + return loader.lookup(name); + } + + /** returns a list of all available hash function names */ + public static Set<String> availableHashFunctionNames() { + return loader.availableServices(); + } + + + @Override + public String toString() { + return name; + } +} Index: lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java (working copy) @@ -0,0 +1,105 @@ +package org.apache.lucene.util.hash; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.BytesRef; + +/** + * This is a very fast, non-cryptographic hash suitable for general hash-based + * lookup. See http://murmurhash.googlepages.com/ for more details. + *

+ * The C version of MurmurHash 2.0 found at that site was ported to Java by + * Andrzej Bialecki (ab at getopt org). + *

+ *

+ * The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable choice of + * hashing functions as the core function had to be adapted to work with BytesRefs with offsets and lengths + * rather than raw byte arrays. + *

+ * @lucene.experimental + */ +public class MurmurHash2 extends HashFunction{ + + + public static final String HASH_NAME="MurmurHash2"; + + public MurmurHash2() { + super(HASH_NAME); + } + + public static int hash(byte[] data, int seed, int offset, int len) { + int m = 0x5bd1e995; + int r = 24; + int h = seed ^ len; + int len_4 = len >> 2; + for (int i = 0; i < len_4; i++) { + int i_4 = offset + (i << 2); + int k = data[i_4 + 3]; + k = k << 8; + k = k | (data[i_4 + 2] & 0xff); + k = k << 8; + k = k | (data[i_4 + 1] & 0xff); + k = k << 8; + k = k | (data[i_4 + 0] & 0xff); + k *= m; + k ^= k >>> r; + k *= m; + h *= m; + h ^= k; + } + int len_m = len_4 << 2; + int left = len - len_m; + if (left != 0) { + if (left >= 3) { + h ^= data[offset + len - 3] << 16; + } + if (left >= 2) { + h ^= data[offset + len - 2] << 8; + } + if (left >= 1) { + h ^= data[offset + len - 1]; + } + h *= m; + } + h ^= h >>> 13; + h *= m; + h ^= h >>> 15; + return h; + } + + /** + * Generates 32 bit hash from byte array with default seed value. 
+ * + * @param data + * byte array to hash + * @param offset + * the start position in the array to hash + * @param len + * length of the array elements to hash + * @return 32 bit hash of the given array + */ + public static final int hash32(final byte[] data, int offset, int len) { + return MurmurHash2.hash(data, 0x9747b28c, offset, len); + } + + + @Override + public final int hash(BytesRef br) { + return hash32(br.bytes, br.offset, br.length); + } + +} Index: lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat =================================================================== --- lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (revision 1368053) +++ lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (working copy) @@ -17,4 +17,5 @@ org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat org.apache.lucene.codecs.memory.MemoryPostingsFormat +org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat org.apache.lucene.codecs.memory.DirectPostingsFormat Index: lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction =================================================================== --- lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction (revision 0) +++ lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction (working copy) @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.util.hash.MurmurHash2 Index: lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java (revision 0) +++ lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java (working copy) @@ -0,0 +1,77 @@ +package org.apache.lucene.codecs.bloom; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat; +import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.FuzzySet; +import org.apache.lucene.util.hash.MurmurHash2; + +/** + * A class used for testing {@link BloomFilteringPostingsFormat} with a concrete + * delegate (Lucene40). Creates a Bloom filter on ALL fields and with tiny + * amounts of memory reserved for the filter. DO NOT USE IN A PRODUCTION + * APPLICATION This is not a realistic application of Bloom Filters as they + * ordinarily are larger and operate on only primary key type fields. + */ +public class TestBloomFilteredLucene40Postings extends PostingsFormat { + + private BloomFilteringPostingsFormat delegate; + + // Special class used to avoid OOM exceptions where Junit tests create many + // fields. 
+ static class LowMemoryBloomFactory extends BloomFilterFactory { + @Override + public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) { + return FuzzySet.createSetBasedOnMaxMemory(1024, new MurmurHash2()); + } + + @Override + public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) { + // For test purposes always maintain the BloomFilter - even past the point + // of usefulness when all bits are set + return false; + } + } + + public TestBloomFilteredLucene40Postings() { + super("TestBloomFilteredLucene40Postings"); + delegate = new BloomFilteringPostingsFormat(new Lucene40PostingsFormat(), + new LowMemoryBloomFactory()); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) + throws IOException { + return delegate.fieldsConsumer(state); + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) + throws IOException { + return delegate.fieldsProducer(state); + } +} Index: lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java (revision 1368053) +++ lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java (working copy) @@ -29,6 +29,7 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.asserting.AssertingPostingsFormat; +import org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings; import org.apache.lucene.codecs.lucene40.Lucene40Codec; import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds; @@ -96,6 +97,10 @@ new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), // add pulsing again with (usually) different parameters new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), + //TODO as a PostingsFormat which wraps others, we should 
allow TestBloomFilteredLucene40Postings to be constructed + //with a choice of concrete PostingsFormats. Maybe useful to have a generic means of marking and dealing + //with such "wrapper" classes? + new TestBloomFilteredLucene40Postings(), new MockSepPostingsFormat(), new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)), new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)), Index: lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat =================================================================== --- lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (revision 1368053) +++ lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (working copy) @@ -20,5 +20,6 @@ org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat org.apache.lucene.codecs.ramonly.RAMOnlyPostingsFormat org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds +org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings org.apache.lucene.codecs.asserting.AssertingPostingsFormat