From 93d9f8dbc47e8a8345643ebef4ce335696badd40 Mon Sep 17 00:00:00 2001
From: Benoit Sigoure
Date: Sun, 14 Mar 2010 03:06:21 -0700
Subject: [PATCH 3/3] HBASE-2323 Allow the client to specify a custom charset to convert the row key.

---
 .../hadoop/hbase/filter/RegexStringComparator.java |   48 +++++++++++++++++++--
 1 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/core/src/main/java/org/apache/hadoop/hbase/filter/RegexStringComparator.java b/core/src/main/java/org/apache/hadoop/hbase/filter/RegexStringComparator.java
index 6a75197..6c37898 100644
--- a/core/src/main/java/org/apache/hadoop/hbase/filter/RegexStringComparator.java
+++ b/core/src/main/java/org/apache/hadoop/hbase/filter/RegexStringComparator.java
@@ -19,11 +19,16 @@
  */
 package org.apache.hadoop.hbase.filter;
 
+import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.util.Bytes;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.regex.Pattern;
 
 /**
@@ -50,6 +55,10 @@ import java.util.regex.Pattern;
  */
 public class RegexStringComparator extends WritableByteArrayComparable {
 
+  private static final Log LOG = LogFactory.getLog(RegexStringComparator.class);
+
+  private Charset charset = Charset.forName(HConstants.UTF8_ENCODING);
+
   private Pattern pattern;
 
   /** Nullary constructor for Writable, do not use */
@@ -64,23 +73,56 @@ public class RegexStringComparator extends WritableByteArrayComparable {
     this.pattern = Pattern.compile(expr, Pattern.DOTALL);
   }
 
+  /**
+   * Specifies the {@link Charset} to use to convert the row key to a String.
+   * <p>
+   * The row key needs to be converted to a String in order to be matched
+   * against the regular expression. This method controls which charset is
+   * used to do this conversion.
+   * <p>
+   * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1}
+   * is recommended.
+   * @param charset The charset to use. Must not be {@code null}.
+   * @throws NullPointerException if {@code charset} is {@code null}.
+   */
+  public void setCharset(final Charset charset) {
+    if (charset == null) {
+      // Fail here rather than later in compareTo() or write().
+      throw new NullPointerException("charset");
+    }
+    this.charset = charset;
+  }
+
   @Override
   public int compareTo(byte[] value) {
     // Use find() for subsequence match instead of matches() (full sequence
     // match) to adhere to the principle of least surprise.
-    return pattern.matcher(Bytes.toString(value)).find() ? 0 : 1;
+    return pattern.matcher(new String(value, charset)).find() ? 0 : 1;
   }
 
   @Override
   public void readFields(DataInput in) throws IOException {
-    String expr = in.readUTF();
+    final String expr = in.readUTF();
     this.value = Bytes.toBytes(expr);
-    this.pattern = Pattern.compile(expr);
+    // Same flags as the constructor, so that the deserialized comparator
+    // matches exactly like the one that was serialized.
+    this.pattern = Pattern.compile(expr, Pattern.DOTALL);
+    final String charsetName = in.readUTF();
+    if (charsetName.length() > 0) {
+      try {
+        this.charset = Charset.forName(charsetName);
+      } catch (IllegalArgumentException e) {
+        // Covers both IllegalCharsetNameException and
+        // UnsupportedCharsetException. The current charset is left unchanged.
+        LOG.error("invalid or unsupported charset: " + charsetName, e);
+      }
+    }
   }
 
   @Override
   public void write(DataOutput out) throws IOException {
     out.writeUTF(pattern.toString());
+    out.writeUTF(charset.name());
   }
 
 }
-- 
1.7.0.2.201.g80978