diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyRegexPrefixRegionSplitPolicy.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyRegexPrefixRegionSplitPolicy.java
new file mode 100644
index 0000000..c2185b8
--- /dev/null
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyRegexPrefixRegionSplitPolicy.java
@@ -0,0 +1,95 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * A custom RegionSplitPolicy that groups rows by a prefix of the row key. The prefix is
+ * determined by a regular expression match.
+ *
+ * This ensures that a region is never split "inside" such a prefix, i.e. rows that share a
+ * prefix stay co-located in the same region.
+ *
+ * As an example, suppose row keys are formatted as salt_userid_eventtype_eventid and you
+ * want region splits to fall only between different userids. This split policy can then be
+ * expressed as a regex like ^[^_]+_[^_]+_ (assuming all parts are non-empty and do not
+ * contain '_'). The row key is decoded with the ISO-8859-1 character set before the regex is
+ * applied, so any byte array can be handled; for example, ^[^\x00]+\x00[^\x00]+\x00 splits
+ * after the second \x00 byte.
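+ *
+ * As a minimal sketch (mirroring the accompanying unit test; a real descriptor would also
+ * carry the table name and column families), a table opts into this policy by setting two
+ * values on its HTableDescriptor:
+ * <pre>
+ * HTableDescriptor htd = new HTableDescriptor();
+ * // use this class instead of the table's default split policy
+ * htd.setValue(HTableDescriptor.SPLIT_POLICY,
+ *     KeyRegexPrefixRegionSplitPolicy.class.getName());
+ * // never place a split point inside the salt_userid_ part of the row key
+ * htd.setValue(KeyRegexPrefixRegionSplitPolicy.PREFIX_REGEX_KEY, "^[^_]+_[^_]+_");
+ * </pre>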
+ */
+public class KeyRegexPrefixRegionSplitPolicy extends ConstantSizeRegionSplitPolicy {
+ private static final Log LOG = LogFactory.getLog(KeyRegexPrefixRegionSplitPolicy.class);
+ public static final String PREFIX_REGEX_KEY = "prefix_split_key_policy.prefix_regex";
+
+ private Pattern prefixPattern = null;
+
+ @Override
+ protected void configureForRegion(HRegion region) {
+ super.configureForRegion(region);
+ if (region != null) {
+ // read the prefix regex from the table descriptor
+ String prefixRegexString = region.getTableDesc().getValue(PREFIX_REGEX_KEY);
+ if (prefixRegexString == null) {
+ LOG.error(PREFIX_REGEX_KEY + " not specified for table "
+ + region.getTableDesc().getNameAsString()
+ + ". Using default RegionSplitPolicy");
+ return;
+ }
+ try {
+ prefixPattern = Pattern.compile(prefixRegexString);
+ } catch (PatternSyntaxException pse) {
+ LOG.error("Invalid value for " + PREFIX_REGEX_KEY + " for table "
+ + region.getTableDesc().getNameAsString() + ":"
+ + prefixRegexString + ". Using default RegionSplitPolicy");
+ }
+ }
+ }
+
+ @Override
+ protected byte[] getSplitPoint() {
+ byte[] splitPoint = super.getSplitPoint();
+ if (prefixPattern != null && splitPoint != null && splitPoint.length > 0) {
+
+ try {
+ // ISO-8859-1 maps each possible byte to a char
+ String s = new String(splitPoint, "ISO-8859-1");
+ Matcher m = prefixPattern.matcher(s);
+ if (m.find()) {
+ // keep everything up to the end of the match, so rows sharing the matched
+ // prefix stay in the same region (for a regex anchored with '^' this equals
+ // the length of the matched group)
+ int prefixLength = m.end();
+ return Arrays.copyOf(splitPoint,
+ Math.min(prefixLength, splitPoint.length));
+ }
+ } catch (UnsupportedEncodingException e) {
+ // cannot happen: ISO-8859-1 is one of the charsets every JVM is required to
+ // support; fall through and return the untruncated split point
+ }
+ }
+
+ return splitPoint;
+ }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionSplitPolicy.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionSplitPolicy.java
index 924a196..859ff81 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionSplitPolicy.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionSplitPolicy.java
@@ -288,4 +288,55 @@ public class TestRegionSplitPolicy {
assertEquals("ijk", Bytes.toString(policy.getSplitPoint()));
}
+
+ @Test
+ public void testKeyRegexPrefixRegionSplitPolicy() throws IOException {
+ String regex = "^[^\\x00]+\\x00[^\\x00]+\\x00";
+ byte[][][] testDataPairs = {
+ {
+ { (byte) 'a', 0x00, (byte) 'b', 0x00, (byte) 'c' },
+ { (byte) 'a', 0x00, (byte) 'b', 0x00 }
+ },
+ {
+ { (byte)'a', 0x00, 0x00, (byte)'b' },
+ { (byte)'a', 0x00, 0x00, (byte)'b' }
+ },
+ {
+ { (byte)'a', (byte)'b' },
+ { (byte)'a', (byte)'b' }
+ }
+ };
+
+ for (byte[][] pair : testDataPairs) {
+ byte[] key = pair[0];
+ byte[] expected = pair[1];
+ HTableDescriptor myHtd = new HTableDescriptor();
+ myHtd.setValue(HTableDescriptor.SPLIT_POLICY,
+ KeyRegexPrefixRegionSplitPolicy.class.getName());
+ myHtd.setValue(KeyRegexPrefixRegionSplitPolicy.PREFIX_REGEX_KEY, regex);
+
+ HRegion myMockRegion = Mockito.mock(HRegion.class);
+ Mockito.doReturn(myHtd).when(myMockRegion).getTableDesc();
+ Mockito.doReturn(stores).when(myMockRegion).getStores();
+
+ HStore mockStore = Mockito.mock(HStore.class);
+ Mockito.doReturn(2000L).when(mockStore).getSize();
+ Mockito.doReturn(true).when(mockStore).canSplit();
+ Mockito.doReturn(key).when(mockStore).getSplitPoint();
+ stores.put(new byte[] { 1 }, mockStore);
+
+ KeyRegexPrefixRegionSplitPolicy policy =
+ (KeyRegexPrefixRegionSplitPolicy) RegionSplitPolicy.create(myMockRegion, conf);
+
+ assertTrue("unexpected split point",
+ Bytes.compareTo(expected, policy.getSplitPoint()) == 0);
+
+ Mockito.doReturn(true).when(myMockRegion).shouldForceSplit();
+ Mockito.doReturn(key).when(myMockRegion).getExplicitSplitPoint();
+
+ policy = (KeyRegexPrefixRegionSplitPolicy) RegionSplitPolicy.create(myMockRegion, conf);
+
+ assertTrue("unexpected split point",
+ Bytes.compareTo(expected, policy.getSplitPoint()) == 0);
+ }
+ }
}