From 7469f11f2e91c3a033cf03b3ff5c9624a9175452 Mon Sep 17 00:00:00 2001 From: Benoit Sigoure Date: Sun, 11 Jul 2010 22:23:51 -0700 Subject: [PATCH] HBASE-2323 filter.RegexStringComparator does not work with certain bytes --- CHANGES.txt | 2 + .../hadoop/hbase/filter/RegexStringComparator.java | 50 +++++++++++++++++--- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index ad5be9c..089352b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -5,6 +5,8 @@ Release 0.20.6 - Unreleased HBASE-2772 Scan doesn't recover from region server failure HBASE-2786 TestHLog.testSplit hangs HBASE-2797 Another NPE in ReadWriteConsistencyControl + HBASE-2323 filter.RegexStringComparator does not work with certain bytes + (Benoit Sigoure via Stack) Release 0.20.5 - Unreleased diff --git a/src/java/org/apache/hadoop/hbase/filter/RegexStringComparator.java b/src/java/org/apache/hadoop/hbase/filter/RegexStringComparator.java index e869e0e..a096412 100644 --- a/src/java/org/apache/hadoop/hbase/filter/RegexStringComparator.java +++ b/src/java/org/apache/hadoop/hbase/filter/RegexStringComparator.java @@ -19,21 +19,29 @@ */ package org.apache.hadoop.hbase.filter; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.util.Bytes; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.util.regex.Pattern; import org.apache.hadoop.hbase.util.Bytes; /** - * This comparator is for use with {@link CompareFilter} implementations, such - * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for - * filtering based on the value of a given column. 
Use it to test if a given + * This comparator is for use with {@link CompareFilter} implementations, such + * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for + * filtering based on the value of a given column. Use it to test if a given * regular expression matches a cell value in the column. *
<p>
* Only EQUAL or NOT_EQUAL {@link org.apache.hadoop.hbase.filter.CompareFilter.CompareOp} - * comparisons are valid with this comparator. + * comparisons are valid with this comparator. *
<p>
* For example: *
<p>
@@ -51,6 +59,10 @@ import org.apache.hadoop.hbase.util.Bytes; */ public class RegexStringComparator extends WritableByteArrayComparable { + private static final Log LOG = LogFactory.getLog(RegexStringComparator.class); + + private Charset charset = Charset.forName(HConstants.UTF8_ENCODING); + private Pattern pattern; /** Nullary constructor for Writable, do not use */ @@ -62,26 +74,50 @@ public class RegexStringComparator extends WritableByteArrayComparable { */ public RegexStringComparator(String expr) { super(Bytes.toBytes(expr)); - this.pattern = Pattern.compile(expr); + this.pattern = Pattern.compile(expr, Pattern.DOTALL); + } + + /** + * Specifies the {@link Charset} to use to convert the row key to a String. + *
<p>
+ * The row key needs to be converted to a String in order to be matched + * against the regular expression. This method controls which charset is + * used to do this conversion. + *
<p>
+ * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1} + * is recommended. + * @param charset The charset to use. + */ + public void setCharset(final Charset charset) { + this.charset = charset; } @Override public int compareTo(byte[] value) { // Use find() for subsequence match instead of matches() (full sequence // match) to adhere to the principle of least surprise. - return pattern.matcher(Bytes.toString(value)).find() ? 0 : 1; + return pattern.matcher(new String(value, charset)).find() ? 0 : 1; } @Override public void readFields(DataInput in) throws IOException { - String expr = in.readUTF(); + final String expr = in.readUTF(); this.value = Bytes.toBytes(expr); this.pattern = Pattern.compile(expr); + final String charset = in.readUTF(); + if (charset.length() > 0) { + try { + this.charset = Charset.forName(charset); + } catch (IllegalCharsetNameException e) { + LOG.error("invalid charset", e); + } + } } @Override public void write(DataOutput out) throws IOException { out.writeUTF(pattern.toString()); + out.writeUTF(charset.name()); } } -- 1.7.2.rc2.1.g9dcc2