Index: hbase-server/src/test/java/org/apache/hadoop/hbase/filter/TestFuzzyRowFilter.java
===================================================================
--- hbase-server/src/test/java/org/apache/hadoop/hbase/filter/TestFuzzyRowFilter.java	(revision 0)
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/filter/TestFuzzyRowFilter.java	(revision 0)
@@ -0,0 +1,234 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.filter;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for the static row-matching helpers of {@link FuzzyRowFilter}.
+ */
+public class TestFuzzyRowFilter {
+  @Test
+  public void testSatisfies() {
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NEXT_EXISTS,
+            FuzzyRowFilter.satisfies(new byte[]{1, (byte) -128, 0, 0, 1}, // row to check
+                                     new byte[]{1, 0, 1}, // fuzzy row
+                                     new byte[]{0, 1, 0})); // mask
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.YES,
+            FuzzyRowFilter.satisfies(new byte[]{1, (byte) -128, 1, 0, 1},
+                                     new byte[]{1, 0, 1},
+                                     new byte[]{0, 1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NEXT_EXISTS,
+            FuzzyRowFilter.satisfies(new byte[]{1, (byte) -128, 2, 0, 1},
+                                     new byte[]{1, 0, 1},
+                                     new byte[]{0, 1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NO_NEXT,
+            FuzzyRowFilter.satisfies(new byte[]{2, 3, 1, 1, 1},
+                                     new byte[]{1, 0, 1},
+                                     new byte[]{0, 1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.YES,
+            FuzzyRowFilter.satisfies(new byte[]{1, 2, 1, 3, 3},
+                                     new byte[]{1, 2, 0, 3},
+                                     new byte[]{0, 0, 1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NEXT_EXISTS,
+            FuzzyRowFilter.satisfies(new byte[]{1, 1, 1, 3, 0}, // row to check
+                                     new byte[]{1, 2, 0, 3}, // fuzzy row
+                                     new byte[]{0, 0, 1, 0})); // mask
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NEXT_EXISTS,
+            FuzzyRowFilter.satisfies(new byte[]{1, 1, 1, 3, 0},
+                                     new byte[]{1, (byte) 245, 0, 3},
+                                     new byte[]{0, 0, 1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NO_NEXT,
+            FuzzyRowFilter.satisfies(new byte[]{1, (byte) 245, 1, 3, 0},
+                                     new byte[]{1, 1, 0, 3},
+                                     new byte[]{0, 0, 1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NO_NEXT,
+            FuzzyRowFilter.satisfies(new byte[]{1, 3, 1, 3, 0},
+                                     new byte[]{1, 2, 0, 3},
+                                     new byte[]{0, 0, 1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NO_NEXT,
+            FuzzyRowFilter.satisfies(new byte[]{2, 1, 1, 1, 0},
+                                     new byte[]{1, 2, 0, 3},
+                                     new byte[]{0, 0, 1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NEXT_EXISTS,
+            FuzzyRowFilter.satisfies(new byte[]{1, 2, 1, 0, 1},
+                                     new byte[]{0, 1, 2},
+                                     new byte[]{1, 0, 0}));
+
+    // Whether a greater matching row exists depends on the row's own byte at a non-fixed
+    // position, not on the (arbitrary) byte the fuzzy row holds there
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NO_NEXT,
+            FuzzyRowFilter.satisfies(new byte[]{(byte) 255, 6},
+                                     new byte[]{0, 5},
+                                     new byte[]{1, 0}));
+
+    Assert.assertEquals(FuzzyRowFilter.SatisfiesCode.NEXT_EXISTS,
+            FuzzyRowFilter.satisfies(new byte[]{5, 6},
+                                     new byte[]{(byte) 255, 5},
+                                     new byte[]{1, 0}));
+  }
+
+  @Test
+  public void testGetNextForFuzzyRule() {
+    assertNext(
+            new byte[]{0, 1, 2}, // fuzzy row
+            new byte[]{1, 0, 0}, // mask
+            new byte[]{1, 2, 1, 0, 1}, // current
+            new byte[]{2, 1, 2, 0, 0}); // expected next
+
+    assertNext(
+            new byte[]{0, 1, 2}, // fuzzy row
+            new byte[]{1, 0, 0}, // mask
+            new byte[]{1, 1, 2, 0, 1}, // current
+            new byte[]{1, 1, 2, 0, 2}); // expected next
+
+    assertNext(
+            new byte[]{0, 1, 0, 2, 0}, // fuzzy row
+            new byte[]{1, 0, 1, 0, 1}, // mask
+            new byte[]{1, 0, 2, 0, 1}, // current
+            new byte[]{1, 1, 0, 2, 0}); // expected next
+
+    assertNext(
+            new byte[]{1, 0, 1},
+            new byte[]{0, 1, 0},
+            new byte[]{1, (byte) 128, 2, 0, 1},
+            new byte[]{1, (byte) 129, 1, 0, 0});
+
+    assertNext(
+            new byte[]{0, 1, 0, 1},
+            new byte[]{1, 0, 1, 0},
+            new byte[]{5, 1, 0, 1},
+            new byte[]{5, 1, 1, 1});
+
+    assertNext(
+            new byte[]{0, 1, 0, 1},
+            new byte[]{1, 0, 1, 0},
+            new byte[]{5, 1, 0, 1, 1},
+            new byte[]{5, 1, 0, 1, 2});
+
+    assertNext(
+            new byte[]{0, 1, 0, 0}, // fuzzy row
+            new byte[]{1, 0, 1, 1}, // mask
+            new byte[]{5, 1, (byte) 255, 1}, // current
+            new byte[]{5, 1, (byte) 255, 2}); // expected next
+
+    assertNext(
+            new byte[]{0, 1, 0, 1}, // fuzzy row
+            new byte[]{1, 0, 1, 0}, // mask
+            new byte[]{5, 1, (byte) 255, 1}, // current
+            new byte[]{6, 1, 0, 1}); // expected next
+
+    assertNext(
+            new byte[]{0, 1, 0, 1}, // fuzzy row
+            new byte[]{1, 0, 1, 0}, // mask
+            new byte[]{5, 1, (byte) 255, 0}, // current
+            new byte[]{5, 1, (byte) 255, 1}); // expected next
+
+    assertNext(
+            new byte[]{5, 1, 1, 0},
+            new byte[]{0, 0, 1, 1},
+            new byte[]{5, 1, (byte) 255, 1},
+            new byte[]{5, 1, (byte) 255, 2});
+
+    assertNext(
+            new byte[]{1, 1, 1, 1},
+            new byte[]{0, 0, 1, 1},
+            new byte[]{1, 1, 2, 2},
+            new byte[]{1, 1, 2, 3});
+
+    assertNext(
+            new byte[]{1, 1, 1, 1},
+            new byte[]{0, 0, 1, 1},
+            new byte[]{1, 1, 3, 2},
+            new byte[]{1, 1, 3, 3});
+
+    assertNext(
+            new byte[]{1, 1, 1, 1},
+            new byte[]{1, 1, 1, 1},
+            new byte[]{1, 1, 2, 3},
+            new byte[]{1, 1, 2, 4});
+
+    assertNext(
+            new byte[]{1, 1, 1, 1},
+            new byte[]{1, 1, 1, 1},
+            new byte[]{1, 1, 3, 2},
+            new byte[]{1, 1, 3, 3});
+
+    // values the fuzzy row holds at non-fixed positions must not leak into the hint
+    assertNext(
+            new byte[]{0, 1, 9}, // fuzzy row
+            new byte[]{1, 0, 1}, // mask
+            new byte[]{1, 0, 0}, // current
+            new byte[]{1, 1, 0}); // expected next
+
+    // row shorter than the fuzzy row: the hint extends it with the remaining positions
+    assertNext(
+            new byte[]{1, 2, 3}, // fuzzy row
+            new byte[]{0, 1, 0}, // mask
+            new byte[]{1, 5}, // current
+            new byte[]{1, 5, 3}); // expected next
+
+    // a max byte held by the fuzzy row at a non-fixed position does not prevent the increment
+    assertNext(
+            new byte[]{(byte) 255, 5},
+            new byte[]{1, 0},
+            new byte[]{5, 6},
+            new byte[]{6, 5});
+
+    // No next for this one
+    Assert.assertNull(FuzzyRowFilter.getNextForFuzzyRule(
+            new byte[]{2, 3, 1, 1, 1}, // row to check
+            new byte[]{1, 0, 1}, // fuzzy row
+            new byte[]{0, 1, 0})); // mask
+    Assert.assertNull(FuzzyRowFilter.getNextForFuzzyRule(
+            new byte[]{1, (byte) 245, 1, 3, 0},
+            new byte[]{1, 1, 0, 3},
+            new byte[]{0, 0, 1, 0}));
+    Assert.assertNull(FuzzyRowFilter.getNextForFuzzyRule(
+            new byte[]{1, 3, 1, 3, 0},
+            new byte[]{1, 2, 0, 3},
+            new byte[]{0, 0, 1, 0}));
+    Assert.assertNull(FuzzyRowFilter.getNextForFuzzyRule(
+            new byte[]{2, 1, 1, 1, 0},
+            new byte[]{1, 2, 0, 3},
+            new byte[]{0, 0, 1, 0}));
+    Assert.assertNull(FuzzyRowFilter.getNextForFuzzyRule(
+            new byte[]{(byte) 255, 6},
+            new byte[]{0, 5},
+            new byte[]{1, 0}));
+  }
+
+  private void assertNext(byte[] fuzzyRow, byte[] mask, byte[] current, byte[] expected) {
+    byte[] nextForFuzzyRule = FuzzyRowFilter.getNextForFuzzyRule(current, fuzzyRow, mask);
+    Assert.assertArrayEquals(expected, nextForFuzzyRule);
+  }
+
+}
Index: hbase-server/src/main/java/org/apache/hadoop/hbase/filter/FuzzyRowFilter.java
===================================================================
--- hbase-server/src/main/java/org/apache/hadoop/hbase/filter/FuzzyRowFilter.java	(revision 0)
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/filter/FuzzyRowFilter.java	(revision 0)
@@ -0,0 +1,317 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.filter;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Pair;
+
+/**
+ * Filters data based on fuzzy row key. Performs fast-forwards during scanning.
+ * It takes pairs (row key, fuzzy info) to match row keys against. Where fuzzy info is
+ * a byte array with 0 or 1 as its values:
+ * <ul>
+ *   <li>
+ *     0 - means that this byte in provided row key is fixed, i.e. row key's byte at same position
+ *         must match
+ *   </li>
+ *   <li>
+ *     1 - means that this byte in provided row key is NOT fixed, i.e. row key's byte at this
+ *         position can be different from the one in provided row key
+ *   </li>
+ * </ul>
+ *
+ * Example:
+ * Let's assume row key format is userId_actionId_year_month. Length of userId is fixed
+ * and is 4, length of actionId is 2 and year and month are 4 and 2 bytes long respectively.
+ *
+ * Let's assume that we need to fetch all users that performed certain action (encoded as "99")
+ * in Jan of any year. Then the pair (row key, fuzzy info) would be the following:
+ * row key = "????_99_????_01" (one can use any value instead of "?")
+ * fuzzy info = "\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00"
+ *
+ * I.e. fuzzy info tells that the matching mask is "????_99_????_01", where a byte at a "?"
+ * position can be any value.
+ */
+public class FuzzyRowFilter extends FilterBase {
+  /** Pairs of (fuzzy row key, fuzzy info) the scanned row keys are matched against. */
+  private List<Pair<byte[], byte[]>> fuzzyKeysData;
+  /** Set once a row key was seen that is greater than any row key matching any fuzzy rule. */
+  private boolean done = false;
+
+  /**
+   * Used internally for reflection, do NOT use it directly
+   */
+  public FuzzyRowFilter() {
+  }
+
+  /**
+   * @param fuzzyKeysData pairs of (fuzzy row key, fuzzy info); the list is used as is, not copied
+   */
+  public FuzzyRowFilter(List<Pair<byte[], byte[]>> fuzzyKeysData) {
+    this.fuzzyKeysData = fuzzyKeysData;
+  }
+
+  /**
+   * Includes the key value if its row key matches at least one fuzzy rule, asks for a seek hint
+   * if some rule can still match a greater row key, and otherwise marks the filter as done.
+   */
+  // TODO: possible improvement: save which fuzzy row key to use when providing a hint
+  @Override
+  public ReturnCode filterKeyValue(KeyValue kv) {
+    byte[] rowKey = kv.getRow();
+    // assigning "worst" result first and looking for better options
+    SatisfiesCode bestOption = SatisfiesCode.NO_NEXT;
+    for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
+      SatisfiesCode satisfiesCode =
+              satisfies(rowKey, fuzzyData.getFirst(), fuzzyData.getSecond());
+      if (satisfiesCode == SatisfiesCode.YES) {
+        return ReturnCode.INCLUDE;
+      }
+
+      if (satisfiesCode == SatisfiesCode.NEXT_EXISTS) {
+        bestOption = SatisfiesCode.NEXT_EXISTS;
+      }
+    }
+
+    if (bestOption == SatisfiesCode.NEXT_EXISTS) {
+      return ReturnCode.SEEK_NEXT_USING_HINT;
+    }
+
+    // the only not handled SatisfiesCode is NO_NEXT, i.e. we are done
+    done = true;
+    return ReturnCode.NEXT_ROW;
+  }
+
+  /**
+   * Returns the first key value on the smallest row key that is greater than the current one
+   * and matches at least one fuzzy rule.
+   */
+  @Override
+  public KeyValue getNextKeyHint(KeyValue currentKV) {
+    byte[] rowKey = currentKV.getRow();
+    byte[] nextRowKey = null;
+    // Searching for the "smallest" row key that satisfies at least one fuzzy row key
+    for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
+      byte[] nextRowKeyCandidate = getNextForFuzzyRule(rowKey,
+              fuzzyData.getFirst(), fuzzyData.getSecond());
+      if (nextRowKeyCandidate == null) {
+        continue;
+      }
+      if (nextRowKey == null || Bytes.compareTo(nextRowKeyCandidate, nextRowKey) < 0) {
+        nextRowKey = nextRowKeyCandidate;
+      }
+    }
+
+    if (nextRowKey == null) {
+      // SHOULD NEVER happen
+      // TODO: is there a better way than throw exception? (stop the scanner?)
+      throw new IllegalArgumentException("No next row key that satisfies fuzzy exists when"
+                                         + " getNextKeyHint() is invoked."
+                                         + " Filter: " + this.toString());
+    }
+
+    return KeyValue.createFirstOnRow(nextRowKey);
+  }
+
+  /**
+   * @return true once a row key greater than any possible match was seen
+   */
+  @Override
+  public boolean filterAllRemaining() {
+    return done;
+  }
+
+  @Override
+  public void write(DataOutput dataOutput) throws IOException {
+    dataOutput.writeInt(this.fuzzyKeysData.size());
+    for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
+      Bytes.writeByteArray(dataOutput, fuzzyData.getFirst());
+      Bytes.writeByteArray(dataOutput, fuzzyData.getSecond());
+    }
+  }
+
+  @Override
+  public void readFields(DataInput dataInput) throws IOException {
+    int count = dataInput.readInt();
+    this.fuzzyKeysData = new ArrayList<Pair<byte[], byte[]>>(count);
+    for (int i = 0; i < count; i++) {
+      byte[] keyBytes = Bytes.readByteArray(dataInput);
+      byte[] keyMeta = Bytes.readByteArray(dataInput);
+      this.fuzzyKeysData.add(new Pair<byte[], byte[]>(keyBytes, keyMeta));
+    }
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder();
+    sb.append("FuzzyRowFilter");
+    sb.append("{fuzzyKeysData=");
+    for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
+      sb.append('{').append(Bytes.toStringBinary(fuzzyData.getFirst())).append(":");
+      sb.append(Bytes.toStringBinary(fuzzyData.getSecond())).append('}');
+    }
+    sb.append("}, ");
+    return sb.toString();
+  }
+
+  // Utility methods
+
+  /** Outcome of matching a single row key against a single fuzzy rule. */
+  static enum SatisfiesCode {
+    /** the row key matches the fuzzy rule */
+    YES,
+    /** the row key does not match, but a greater row key that matches exists */
+    NEXT_EXISTS,
+    /** the row key does not match and is greater than any row key that matches */
+    NO_NEXT
+  }
+
+  static SatisfiesCode satisfies(byte[] row,
+                                 byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
+    return satisfies(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
+  }
+
+  /**
+   * Matches row[offset, offset + length) against the given fuzzy rule.
+   */
+  private static SatisfiesCode satisfies(byte[] row, int offset, int length,
+                                         byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
+    if (row == null) {
+      // do nothing, let scan to proceed
+      return SatisfiesCode.YES;
+    }
+
+    boolean nextRowKeyCandidateExists = false;
+
+    for (int i = 0; i < fuzzyKeyMeta.length && i < length; i++) {
+      // First, checking if this position is fixed and not equals the given one
+      boolean bytePositionFixed = fuzzyKeyMeta[i] == 0;
+      boolean fixedByteIncorrect = bytePositionFixed && fuzzyKeyBytes[i] != row[i + offset];
+      if (fixedByteIncorrect) {
+        // in this case there's another row that satisfies fuzzy rule and bigger than this row
+        if (nextRowKeyCandidateExists) {
+          return SatisfiesCode.NEXT_EXISTS;
+        }
+
+        // If this row byte is less than fixed then there's a byte array bigger than
+        // this row and which satisfies the fuzzy rule. Otherwise there's no such byte array:
+        // this row is simply bigger than any byte array that satisfies the fuzzy rule
+        boolean rowByteLessThanFixed = (row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF);
+        return rowByteLessThanFixed ? SatisfiesCode.NEXT_EXISTS : SatisfiesCode.NO_NEXT;
+      }
+
+      // Second, checking if this position is not fixed and the row's byte is not the biggest. In
+      // this case the row's byte can be increased, i.e. there's a byte array bigger than this
+      // row and which satisfies the fuzzy rule. The byte fuzzyKeyBytes holds at a non-fixed
+      // position is arbitrary and must not be consulted: getNextForFuzzyRule() looks at the row
+      // as well, and the two have to agree or getNextKeyHint() finds no hint
+      if (fuzzyKeyMeta[i] == 1 && !isMax(row[i + offset])) {
+        nextRowKeyCandidateExists = true;
+      }
+    }
+
+    return SatisfiesCode.YES;
+  }
+
+  private static boolean isMax(byte fuzzyKeyByte) {
+    return (fuzzyKeyByte & 0xFF) == 255;
+  }
+
+  static byte[] getNextForFuzzyRule(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
+    return getNextForFuzzyRule(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
+  }
+
+  /**
+   * NOTE: returns null if next doesn't exist
+   */
+  private static byte[] getNextForFuzzyRule(byte[] row, int offset, int length,
+                                            byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
+    // To find out the next "smallest" byte array that satisfies fuzzy rule and is "greater" than
+    // the given one we do the following:
+    // 1. setting values on all "fixed" positions to the values from fuzzyKeyBytes
+    // 2. if during the first step given row did not increase, then we increase the value at
+    //    the right-most "non-fixed" position seen so far (where it is not maximum already)
+
+    // It is easier to perform this by using fuzzyKeyBytes copy and setting "non-fixed" position
+    // values than otherwise.
+    byte[] result = Arrays.copyOf(fuzzyKeyBytes, Math.max(length, fuzzyKeyBytes.length));
+    // right-most "non-fixed" position seen so far whose row byte can still be increased
+    int toInc = -1;
+    // first position to the right of the one that made result "greater" than the row
+    int resetFrom = -1;
+
+    for (int i = 0; i < result.length; i++) {
+      if (i >= length) {
+        // the row ended and it matched so far, i.e. it is a proper prefix of result: result is
+        // "greater" already. (Reading row[offset + i] here would run past the end of the row.)
+        resetFrom = i;
+        break;
+      }
+      if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
+        result[i] = row[offset + i];
+        if (!isMax(row[offset + i])) {
+          // this is "non-fixed" position and is not at max value, hence we can increase it
+          toInc = i;
+        }
+      } else if (fuzzyKeyMeta[i] == 0) {
+        if ((row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF)) {
+          // if setting value for any fixed position increased the original array,
+          // we are OK
+          resetFrom = i + 1;
+          break;
+        }
+        if ((row[i + offset] & 0xFF) > (fuzzyKeyBytes[i] & 0xFF)) {
+          // if setting value for any fixed position makes array "smaller", then just stop:
+          // in case we found some non-fixed position to increase we will do it, otherwise
+          // there's no "next" row key that satisfies fuzzy rule and "greater" than given row
+          break;
+        }
+      }
+    }
+
+    if (resetFrom < 0) {
+      if (toInc < 0) {
+        return null;
+      }
+      result[toInc]++;
+      resetFrom = toInc + 1;
+    }
+
+    // Setting all "non-fixed" positions to zeroes to the right of the one that made result
+    // "greater", so that found "next" row key is the smallest possible. This also drops whatever
+    // values fuzzyKeyBytes holds at its "non-fixed" positions
+    for (int i = resetFrom; i < result.length; i++) {
+      if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
+        result[i] = 0;
+      }
+    }
+
+    return result;
+  }
+
+}