diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyRegexPrefixRegionSplitPolicy.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyRegexPrefixRegionSplitPolicy.java
new file mode 100644
index 0000000..c2185b8
--- /dev/null
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyRegexPrefixRegionSplitPolicy.java
@@ -0,0 +1,95 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * A custom RegionSplitPolicy that groups rows by a prefix of the row-key. The prefix is
+ * determined by a regular expression match against the row key.
+ *
+ * This ensures that a region is not split "inside" a prefix of a row key, i.e. rows can be
+ * co-located in a region by their prefix.
+ *
+ * As an example, if you have row keys formatted as salt_userid_eventtype_eventid
+ * and you want to split rows only between userids, this split policy can be
+ * expressed as the regex ^[^_]+_[^_]+_ (assuming all parts are non-empty and
+ * do not contain '_'). The regex string is interpreted in the ISO-8859-1 character set, so
+ * any byte array can be matched; for example, ^[^\x00]+\x00[^\x00]+\x00 splits after the
+ * second \x00 byte.
+ */
+public class KeyRegexPrefixRegionSplitPolicy extends ConstantSizeRegionSplitPolicy {
+  private static final Log LOG = LogFactory.getLog(KeyRegexPrefixRegionSplitPolicy.class);
+  public static final String PREFIX_REGEX_KEY = "prefix_split_key_policy.prefix_regex";
+
+  private Pattern prefixPattern = null;
+
+  @Override
+  protected void configureForRegion(HRegion region) {
+    super.configureForRegion(region);
+    if (region != null) {
+      // read the prefix regex from the table descriptor
+      String prefixRegexString = region.getTableDesc().getValue(PREFIX_REGEX_KEY);
+      if (prefixRegexString == null) {
+        LOG.error(PREFIX_REGEX_KEY + " not specified for table "
+            + region.getTableDesc().getNameAsString()
+            + ". Using default RegionSplitPolicy");
+        return;
+      }
+      try {
+        prefixPattern = Pattern.compile(prefixRegexString);
+      } catch (PatternSyntaxException pse) {
+        LOG.error("Invalid value for " + PREFIX_REGEX_KEY + " for table "
+            + region.getTableDesc().getNameAsString() + ": "
+            + prefixRegexString + ". Using default RegionSplitPolicy");
+      }
+    }
+  }
+
+  @Override
+  protected byte[] getSplitPoint() {
+    byte[] splitPoint = super.getSplitPoint();
+    if (prefixPattern != null && splitPoint != null && splitPoint.length > 0) {
+
+      try {
+        // ISO-8859-1 maps each possible byte to exactly one char
+        String s = new String(splitPoint, "ISO-8859-1");
+        Matcher m = prefixPattern.matcher(s);
+        if (m.find()) {
+          int prefixLength = m.group().length();
+          // group split keys by the matched prefix
+          return Arrays.copyOf(splitPoint,
+              Math.min(prefixLength, splitPoint.length));
+        }
+      } catch (UnsupportedEncodingException e) {
+        // ISO-8859-1 is always supported; ignore
+      }
+    }

+    return splitPoint;
+  }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionSplitPolicy.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionSplitPolicy.java
index 924a196..859ff81 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionSplitPolicy.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionSplitPolicy.java
@@ -288,4 +288,55 @@ public class TestRegionSplitPolicy {
     assertEquals("ijk", Bytes.toString(policy.getSplitPoint()));
   }
 
+  @Test
+  public void testKeyRegexPrefixRegionSplitPolicy() throws IOException {
+    String regex = "^[^\\x00]+\\x00[^\\x00]+\\x00";
+    byte[][][] testDataPairs = {
+        {
+          { (byte) 'a', 0x00, (byte) 'b', 0x00, (byte) 'c' },
+          { (byte) 'a', 0x00, (byte) 'b', 0x00 }
+        },
+        {
+          { (byte) 'a', 0x00, 0x00, (byte) 'b' },
+          { (byte) 'a', 0x00, 0x00, (byte) 'b' }
+        },
+        {
+          { (byte) 'a', (byte) 'b' },
+          { (byte) 'a', (byte) 'b' }
+        }
+    };
+
+    for (byte[][] pair : testDataPairs) {
+      byte[] key = pair[0];
+      byte[] expected = pair[1];
+      HTableDescriptor myHtd = new HTableDescriptor();
+      myHtd.setValue(HTableDescriptor.SPLIT_POLICY,
+          KeyRegexPrefixRegionSplitPolicy.class.getName());
+      myHtd.setValue(KeyRegexPrefixRegionSplitPolicy.PREFIX_REGEX_KEY, regex);
+
+      HRegion myMockRegion = Mockito.mock(HRegion.class);
+      Mockito.doReturn(myHtd).when(myMockRegion).getTableDesc();
+      Mockito.doReturn(stores).when(myMockRegion).getStores();
+
+      HStore mockStore = Mockito.mock(HStore.class);
+      Mockito.doReturn(2000L).when(mockStore).getSize();
+      Mockito.doReturn(true).when(mockStore).canSplit();
+      Mockito.doReturn(key).when(mockStore).getSplitPoint();
+      stores.put(new byte[] { 1 }, mockStore);
+
+      KeyRegexPrefixRegionSplitPolicy policy =
+          (KeyRegexPrefixRegionSplitPolicy) RegionSplitPolicy.create(myMockRegion, conf);
+
+      assertTrue("unexpected split point",
+          Bytes.compareTo(expected, policy.getSplitPoint()) == 0);
+
+      Mockito.doReturn(true).when(myMockRegion).shouldForceSplit();
+      Mockito.doReturn(key).when(myMockRegion).getExplicitSplitPoint();
+
+      policy = (KeyRegexPrefixRegionSplitPolicy) RegionSplitPolicy.create(myMockRegion, conf);
+
+      assertTrue("unexpected split point",
+          Bytes.compareTo(expected, policy.getSplitPoint()) == 0);
+    }
+  }
 }
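
Usage sketch (not part of the diff above): a table opts into this policy through its table
descriptor, the same way the unit test wires it up. This is a minimal fragment, not a complete
program; the table name and the admin-side createTable call are omitted, and the regex is the
salt_userid_eventtype_eventid example from the class javadoc.

    // HTableDescriptor is org.apache.hadoop.hbase.HTableDescriptor;
    // KeyRegexPrefixRegionSplitPolicy is the class added by this patch.
    HTableDescriptor htd = new HTableDescriptor();
    // use the regex prefix policy instead of the table's default split policy
    htd.setValue(HTableDescriptor.SPLIT_POLICY,
        KeyRegexPrefixRegionSplitPolicy.class.getName());
    // never split a region inside the salt_userid prefix of keys shaped like
    // salt_userid_eventtype_eventid
    htd.setValue(KeyRegexPrefixRegionSplitPolicy.PREFIX_REGEX_KEY, "^[^_]+_[^_]+_");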