diff --git a/src/main/java/org/apache/hadoop/hbase/util/DynamicByteBloomFilter.java b/src/main/java/org/apache/hadoop/hbase/util/DynamicByteBloomFilter.java
deleted file mode 100644
index f818279..0000000
--- a/src/main/java/org/apache/hadoop/hbase/util/DynamicByteBloomFilter.java
+++ /dev/null
@@ -1,334 +0,0 @@
-/**
- *
- * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
- * All rights reserved.
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- * - Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * - Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the distribution.
- * - Neither the name of the University Catholique de Louvain - UCL
- * nor the names of its contributors may be used to endorse or
- * promote products derived from this software without specific prior
- * written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.util;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import org.apache.hadoop.io.Writable;
-
-/**
- * Implements a dynamic Bloom filter, as defined in the INFOCOM 2006 paper.
- *
- * A dynamic Bloom filter (DBF) makes use of a s * m bit matrix but
- * each of the s rows is a standard Bloom filter. The creation
- * process of a DBF is iterative. At the start, the DBF is a 1 * m
- * bit matrix, i.e., it is composed of a single standard Bloom filter.
- * It assumes that nr elements are recorded in the
- * initial bit vector, where nr <= n (n is
- * the cardinality of the set A to record in the filter).
- *
- * As the size of A grows during the execution of the application,
- * several keys must be inserted in the DBF. When inserting a key into the DBF,
- * one must first get an active Bloom filter in the matrix. A Bloom filter is
- * active when the number of recorded keys, nr, is
- * strictly less than the current cardinality of A, n.
- * If an active Bloom filter is found, the key is inserted and
- * nr is incremented by one. On the other hand, if there
- * is no active Bloom filter, a new one is created (i.e., a new row is added to
- * the matrix) according to the current size of A and the element
- * is added in this new Bloom filter and the nr value of
- * this new Bloom filter is set to one. A given key is said to belong to the
- * DBF if the k positions are set to one in one of the matrix rows.
- *
- * Originally created by
- * European Commission One-Lab Project 034819.
- *
- * @see BloomFilter A Bloom filter
- *
- * @see Theory and Network Applications of Dynamic Bloom Filters
- */
-public class DynamicByteBloomFilter implements BloomFilter {
- /** Current file format version */
- public static final int VERSION = 2;
- /** Maximum number of keys in a dynamic Bloom filter row. */
- protected final int keyInterval;
- /** The maximum false positive rate per bloom */
- protected final float errorRate;
- /** Hash type */
- protected final int hashType;
- /** The number of keys recorded in the current Bloom filter. */
- protected int curKeys;
- /** expected size of bloom filter matrix (used during reads) */
- protected int readMatrixSize;
- /** The matrix of Bloom filters (contains bloom data only during writes). */
- protected ByteBloomFilter[] matrix;
-
- /**
- * Normal read constructor. Loads bloom filter meta data.
- * @param meta stored bloom meta data
- * @throws IllegalArgumentException if the stored meta data is invalid
- */
- public DynamicByteBloomFilter(ByteBuffer meta)
- throws IllegalArgumentException {
- int version = meta.getInt();
- if (version != VERSION) throw new IllegalArgumentException("Bad version");
-
- this.keyInterval = meta.getInt();
- this.errorRate = meta.getFloat();
- this.hashType = meta.getInt();
- this.readMatrixSize = meta.getInt();
- this.curKeys = meta.getInt();
-
- readSanityCheck();
-
- this.matrix = new ByteBloomFilter[1];
- this.matrix[0] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
- }
-
- /**
- * Normal write constructor. Note that this doesn't allocate bloom data by
- * default. Instead, call allocBloom() before adding entries.
- * @param keyInterval Maximum number of keys to record per Bloom filter row.
- * @param errorRate The target false positive rate of each individual
- * Bloom filter row.
- * @param hashType type of the hashing function (see
- * {@link org.apache.hadoop.util.hash.Hash}).
- * @throws IllegalArgumentException if the input parameters are invalid
- */
- public DynamicByteBloomFilter(int keyInterval, float errorRate, int hashType)
- throws IllegalArgumentException {
- this.keyInterval = keyInterval;
- this.errorRate = errorRate;
- this.hashType = hashType;
- this.curKeys = 0;
-
- if (keyInterval <= 0) {
- throw new IllegalArgumentException("keyInterval must be > 0");
- }
-
- this.matrix = new ByteBloomFilter[1];
- this.matrix[0] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
- }
-
- @Override
- public void allocBloom() {
- this.matrix[0].allocBloom();
- }
-
- void readSanityCheck() throws IllegalArgumentException {
- if (this.curKeys <= 0) {
- throw new IllegalArgumentException("last bloom's key count invalid");
- }
-
- if (this.readMatrixSize <= 0) {
- throw new IllegalArgumentException("matrix size must be known");
- }
- }
-
- @Override
- public void add(byte []buf, int offset, int len) {
- BloomFilter bf = getCurBloom();
-
- if (bf == null) {
- addRow();
- bf = matrix[matrix.length - 1];
- curKeys = 0;
- }
-
- bf.add(buf, offset, len);
- curKeys++;
- }
-
- @Override
- public void add(byte []buf) {
- add(buf, 0, buf.length);
- }
-
- /**
- * Should only be used in tests when writing a bloom filter.
- */
- boolean contains(byte [] buf) {
- return contains(buf, 0, buf.length);
- }
-
- /**
- * Should only be used in tests when writing a bloom filter.
- */
- boolean contains(byte [] buf, int offset, int length) {
- for (int i = 0; i < matrix.length; i++) {
- if (matrix[i].contains(buf, offset, length)) {
- return true;
- }
- }
- return false;
- }
-
- @Override
- public boolean contains(byte [] buf, ByteBuffer theBloom) {
- return contains(buf, 0, buf.length, theBloom);
- }
-
- @Override
- public boolean contains(byte[] buf, int offset, int length,
- ByteBuffer theBloom) {
- if (offset + length > buf.length) {
- return false;
- }
-
- // current version assumes uniform size
- int bytesPerBloom = this.matrix[0].getByteSize();
-
- if (theBloom.limit() != bytesPerBloom * readMatrixSize) {
- throw new IllegalArgumentException("Bloom does not match expected size");
- }
-
- ByteBuffer tmp = theBloom.duplicate();
-
- // note: actually searching an array of blooms that have been serialized
- for (int m = 0; m < readMatrixSize; ++m) {
- tmp.position(m * bytesPerBloom);
- tmp.limit(tmp.position() + bytesPerBloom);
- boolean match = this.matrix[0].contains(buf, offset, length, tmp.slice());
- if (match) {
- return true;
- }
- }
-
- // matched no bloom filters
- return false;
- }
-
- int bloomCount() {
- return Math.max(this.matrix.length, this.readMatrixSize);
- }
-
- @Override
- public int getKeyCount() {
- return (bloomCount()-1) * this.keyInterval + this.curKeys;
- }
-
- @Override
- public int getMaxKeys() {
- return bloomCount() * this.keyInterval;
- }
-
- @Override
- public int getByteSize() {
- return bloomCount() * this.matrix[0].getByteSize();
- }
-
- @Override
- public void finalize() {
- }
-
- /**
- * Adds a new row to this dynamic Bloom filter.
- */
- private void addRow() {
- ByteBloomFilter[] tmp = new ByteBloomFilter[matrix.length + 1];
-
- for (int i = 0; i < matrix.length; i++) {
- tmp[i] = matrix[i];
- }
-
- tmp[tmp.length-1] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
- tmp[tmp.length-1].allocBloom();
- matrix = tmp;
- }
-
- /**
- * Returns the currently-unfilled row in the dynamic Bloom Filter array.
- * @return The active standard Bloom filter, or null if the current
- * row is already full.
- */
- private BloomFilter getCurBloom() {
- if (curKeys >= keyInterval) {
- return null;
- }
-
- return matrix[matrix.length - 1];
- }
-
- @Override
- public Writable getMetaWriter() {
- return new MetaWriter();
- }
-
- @Override
- public Writable getDataWriter() {
- return new DataWriter();
- }
-
- private class MetaWriter implements Writable {
- protected MetaWriter() {}
- @Override
- public void readFields(DataInput arg0) throws IOException {
- throw new IOException("Cant read with this class.");
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeInt(VERSION);
- out.writeInt(keyInterval);
- out.writeFloat(errorRate);
- out.writeInt(hashType);
- out.writeInt(matrix.length);
- out.writeInt(curKeys);
- }
- }
-
- private class DataWriter implements Writable {
- protected DataWriter() {}
- @Override
- public void readFields(DataInput arg0) throws IOException {
- throw new IOException("Cant read with this class.");
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- for (int i = 0; i < matrix.length; ++i) {
- matrix[i].writeBloom(out);
- }
- }
- }
-}
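
Editor's note: the javadoc of the deleted class describes the dynamic Bloom filter scheme only in prose, so the following is a minimal, self-contained sketch of the same grow-by-rows behavior. It is an illustration under assumptions, not the removed implementation: the class name DynamicBloomSketch, the BitSet-backed rows, the textbook sizing formulas, and the FNV-style hash mix are all invented for this note, whereas the deleted code delegates row storage, sizing, and hashing to ByteBloomFilter and a configurable org.apache.hadoop.util.hash.Hash type.

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

/** Minimal sketch of the grow-by-rows scheme described in the javadoc above. */
public class DynamicBloomSketch {
  private final int keyInterval;          // max keys recorded per row
  private final int bitsPerRow;           // bits per row
  private final int hashCount;            // probes per row
  private final List<BitSet> rows = new ArrayList<BitSet>();
  private int curKeys;                    // keys recorded in the newest row

  public DynamicBloomSketch(int keyInterval, double errorRate) {
    if (keyInterval <= 0) {
      throw new IllegalArgumentException("keyInterval must be > 0");
    }
    this.keyInterval = keyInterval;
    // textbook sizing: m = -n*ln(p)/(ln 2)^2 bits, k = (m/n)*ln 2 probes
    this.bitsPerRow = (int) Math.ceil(
        -keyInterval * Math.log(errorRate) / (Math.log(2) * Math.log(2)));
    this.hashCount = Math.max(1,
        (int) Math.round((double) bitsPerRow / keyInterval * Math.log(2)));
    rows.add(new BitSet(bitsPerRow));
  }

  /** Adds a key; starts a new row once the current one holds keyInterval keys. */
  public void add(byte[] key) {
    if (curKeys >= keyInterval) {
      rows.add(new BitSet(bitsPerRow));   // earlier rows are never rehashed
      curKeys = 0;
    }
    BitSet row = rows.get(rows.size() - 1);
    int h1 = hash(key, 0), h2 = hash(key, h1);
    for (int i = 0; i < hashCount; i++) {
      row.set(Math.floorMod(h1 + i * h2, bitsPerRow));
    }
    curKeys++;
  }

  /** A key is reported present if all probes hit in any single row. */
  public boolean mightContain(byte[] key) {
    int h1 = hash(key, 0), h2 = hash(key, h1);
    for (BitSet row : rows) {
      boolean match = true;
      for (int i = 0; i < hashCount && match; i++) {
        match = row.get(Math.floorMod(h1 + i * h2, bitsPerRow));
      }
      if (match) {
        return true;
      }
    }
    return false;
  }

  // Illustrative FNV-1a style mixing; the deleted class instead takes a
  // hashType constant for org.apache.hadoop.util.hash.Hash.
  private static int hash(byte[] key, int seed) {
    int h = 0x811c9dc5 ^ seed;
    for (byte b : key) {
      h = (h ^ (b & 0xff)) * 0x01000193;
    }
    return h;
  }
}

The property the sketch mirrors is the one the removed test below relied on: add() never touches earlier rows, it only starts a new row once the current one holds keyInterval keys, while a lookup must consult every row, so the aggregate false positive rate grows roughly linearly with the number of rows.
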
diff --git a/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java b/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java
index a082af6..5f62b8c 100644
--- a/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java
+++ b/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java
@@ -20,13 +20,9 @@
package org.apache.hadoop.hbase.util;
-import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
-import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.nio.ByteBuffer;
-import java.util.BitSet;
-import java.util.Random;
import junit.framework.TestCase;
@@ -141,61 +137,4 @@ public class TestByteBloomFilter extends TestCase {
// test: foldFactor > log(max/actual)
}
-
- public void testDynamicBloom() throws Exception {
- int keyInterval = 1000;
- float err = (float)0.01;
- BitSet valid = new BitSet(keyInterval*4);
-
- DynamicByteBloomFilter bf1 = new DynamicByteBloomFilter(keyInterval, err,
- Hash.MURMUR_HASH);
- bf1.allocBloom();
-
- long seed = System.currentTimeMillis();
- Random r = new Random(seed);
- System.out.println("seed = " + seed);
-
- for (int i = 0; i < keyInterval*4; ++i) { // add
- if (r.nextBoolean()) {
- bf1.add(Bytes.toBytes(i));
- valid.set(i);
-
- // we assume only 2 blooms in this test, so exit before a 3rd is made
- if (bf1.getKeyCount() == 2000) {
- break;
- }
- }
- }
- assertTrue(2 <= bf1.bloomCount());
- System.out.println("keys added = " + bf1.getKeyCount());
-
- // test serialization/deserialization
- ByteArrayOutputStream metaOut = new ByteArrayOutputStream();
- ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
- bf1.getMetaWriter().write(new DataOutputStream(metaOut));
- bf1.getDataWriter().write(new DataOutputStream(dataOut));
- ByteBuffer bb = ByteBuffer.wrap(dataOut.toByteArray());
- DynamicByteBloomFilter newBf1 = new DynamicByteBloomFilter(
- ByteBuffer.wrap(metaOut.toByteArray()));
-
- int falsePositives = 0;
- for (int i = 0; i < keyInterval*4; ++i) { // check
- if (newBf1.contains(Bytes.toBytes(i), bb)) {
- if (!valid.get(i)) ++falsePositives;
- } else {
- if (valid.get(i)) {
- fail("false negative on key " + i);
- }
- }
- }
-
- // Dynamic Blooms are a little sneaky: since a key is reported present if
- // any row matches, the overall false positive rate is roughly
- // err * bloomCount, and bloomCount == 2000/1000 == 2 here. With about
- // keyInterval*2 true negatives queried, we expect roughly
- // (keyInterval*2) * err * bloomCount false positives; allow some tolerance.
- System.out.println("False positives: " + falsePositives);
- assertTrue(falsePositives <= (keyInterval*5)*err);
- }
-
}
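
Editor's note: the tolerance in the removed testDynamicBloom works out as follows, using only its own constants (keyInterval = 1000, err = 0.01, two rows). Roughly keyInterval*2 = 2000 of the 4000 queried keys were never inserted; each of those matches a single row with probability of about err (each row is sized for keyInterval keys at that rate), and a key is reported present if either of the bloomCount = 2 rows matches, so the expected false positive count is about 2000 * 0.01 * 2 = 40. The asserted bound of (keyInterval*5)*err = 50 sits a little above that expectation to absorb random variation.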