Adds position/occurrence overloads of regexp_replace, the new regexp_substr,
regexp_instr and regexp_similar builtins, and extends RE2::GlobalReplace with
position/occurrence arguments.

NOTE(review): this patch was rebuilt from a copy that had lost its line breaks,
indentation and template arguments (e.g. reinterpret_cast<...>). Those were restored
by convention; verify context lines against the tree before applying. The blob ids
on the "index" lines predate the review fixes below.

diff --git a/be/src/exprs/string-functions.cc b/be/src/exprs/string-functions.cc
index 445c4bc..b1362a4 100644
--- a/be/src/exprs/string-functions.cc
+++ b/be/src/exprs/string-functions.cc
@@ -382,8 +382,18 @@ StringVal StringFunctions::RegexpExtract(FunctionContext* context, const StringV
 }
 
 StringVal StringFunctions::RegexpReplace(FunctionContext* context, const StringVal& str,
-    const StringVal& pattern, const StringVal& replace) {
-  if (str.is_null || pattern.is_null || replace.is_null) return StringVal::null();
+    const StringVal& pattern, const StringVal& replace, const BigIntVal& position,
+    const BigIntVal& occurrence) {
+  if (str.is_null || pattern.is_null || replace.is_null || position.is_null || occurrence.is_null)
+    return StringVal::null();
+  int64_t fixed_pos = position.val;
+  // Negative positions count from the end of the string (-1 is the last character).
+  if (fixed_pos < 0) fixed_pos = str.len + fixed_pos + 1;
+  if (fixed_pos <= 0 || occurrence.val < 0) return str;
+  // A string of length n has at most n + 1 start positions and n + 1 non-overlapping
+  // matches, so anything beyond that leaves the input untouched. Returning here also
+  // keeps the narrowing casts to int below from wrapping around.
+  if (fixed_pos - 1 > str.len || occurrence.val - 1 > str.len) return str;
 
   re2::RE2* re = reinterpret_cast<re2::RE2*>(
       context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
@@ -402,10 +412,169 @@ StringVal StringFunctions::RegexpReplace(FunctionContext* context, const StringV
   re2::StringPiece replace_str =
       re2::StringPiece(reinterpret_cast<char*>(replace.ptr), replace.len);
   string result_str = AnyValUtil::ToString(str);
-  re2::RE2::GlobalReplace(&result_str, *re, replace_str);
+  // RE2::GlobalReplace is patched in thirdparty to also take a position and an occurrence.
+  re2::RE2::GlobalReplace(&result_str, *re, replace_str,
+      static_cast<int>(fixed_pos), static_cast<int>(occurrence.val));
   return AnyValUtil::FromString(context, result_str);
 }
 
+StringVal StringFunctions::RegexpReplace(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern, const StringVal& replace, const BigIntVal& position) {
+  return RegexpReplace(context, str, pattern, replace, position, BigIntVal(0));
+}
+
+StringVal StringFunctions::RegexpReplace(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern, const StringVal& replace) {
+  return RegexpReplace(context, str, pattern, replace, BigIntVal(1), BigIntVal(0));
+}
+
+StringVal StringFunctions::RegexpSubstr(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern, const BigIntVal& position, const BigIntVal& occurrence) {
+  if (str.is_null || pattern.is_null || position.is_null || occurrence.is_null)
+    return StringVal::null();
+  int64_t fixed_pos = position.val;
+  // Negative positions count from the end of the string (-1 is the last character).
+  if (fixed_pos < 0) fixed_pos = str.len + fixed_pos + 1;
+  if (fixed_pos <= 0 || occurrence.val <= 0) return StringVal();
+  // A start position past the end of the string would make the StringPiece below point
+  // outside the input with a negative length.
+  if (fixed_pos - 1 > str.len) return StringVal();
+
+  re2::RE2* re = reinterpret_cast<re2::RE2*>(
+      context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
+  scoped_ptr<re2::RE2> scoped_re; // destroys re if state->re is NULL
+  if (re == NULL) {
+    DCHECK(!context->IsArgConstant(1));
+    string error_str;
+    re = CompileRegex(pattern, &error_str);
+    if (re == NULL) {
+      context->AddWarning(error_str.c_str());
+      return StringVal::null();
+    }
+    scoped_re.reset(re);
+  }
+
+  re2::StringPiece str_sp =
+      re2::StringPiece(reinterpret_cast<char*>(str.ptr + fixed_pos - 1),
+                       str.len - fixed_pos + 1);
+  for (int64_t i = 1; i < occurrence.val; i++) {
+    if (!re2::RE2::FindAndConsume(&str_sp, *re)) return StringVal();
+  }
+
+  // NOTE(review): FindAndConsume() fills result_str from capture group 1, not from the
+  // whole match, and RE2 reports no match when the pattern has fewer groups than
+  // output arguments -- confirm that patterns without a group are handled upstream.
+  string result_str;
+  bool success = re2::RE2::FindAndConsume(&str_sp, *re, &result_str);
+  if (!success) return StringVal();
+  return AnyValUtil::FromString(context, result_str);
+}
+
+StringVal StringFunctions::RegexpSubstr(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern, const BigIntVal& position) {
+  return RegexpSubstr(context, str, pattern, position, BigIntVal(1));
+}
+
+StringVal StringFunctions::RegexpSubstr(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern) {
+  return RegexpSubstr(context, str, pattern, BigIntVal(1), BigIntVal(1));
+}
+
+IntVal StringFunctions::RegexpInstr(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern, const BigIntVal& position,
+    const BigIntVal& occurrence, const TinyIntVal& return_opt) {
+  if (str.is_null || pattern.is_null || position.is_null || occurrence.is_null ||
+      return_opt.is_null || (return_opt.val != 0 && return_opt.val != 1))
+    return IntVal::null();
+  int64_t fixed_pos = position.val;
+  // Negative positions count from the end of the string (-1 is the last character).
+  if (fixed_pos < 0) fixed_pos = str.len + fixed_pos + 1;
+  if (fixed_pos <= 0 || occurrence.val <= 0) return IntVal(0);
+  // A start position past the end of the string would make the StringPiece below point
+  // outside the input with a negative length.
+  if (fixed_pos - 1 > str.len) return IntVal(0);
+
+  re2::RE2* re = reinterpret_cast<re2::RE2*>(
+      context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
+  scoped_ptr<re2::RE2> scoped_re; // destroys re if state->re is NULL
+  if (re == NULL) {
+    DCHECK(!context->IsArgConstant(1));
+    string error_str;
+    re = CompileRegex(pattern, &error_str);
+    if (re == NULL) {
+      context->AddWarning(error_str.c_str());
+      return IntVal::null();
+    }
+    scoped_re.reset(re);
+  }
+
+  re2::StringPiece str_sp =
+      re2::StringPiece(reinterpret_cast<char*>(str.ptr + fixed_pos - 1),
+                       str.len - fixed_pos + 1);
+  int32_t consumed;
+  int64_t total_consumed = fixed_pos - 1;
+  for (int64_t i = 1; i < occurrence.val; i++) {
+    // RE2::DoMatch is made public by the thirdparty part of this patch.
+    if (!re->DoMatch(str_sp, RE2::UNANCHORED, &consumed, 0, 0)) return IntVal(0);
+    str_sp.remove_prefix(consumed);
+    total_consumed += consumed;
+  }
+  string match_str;
+  re2::RE2::Arg argv = &match_str;
+  const re2::RE2::Arg* const args[] = { &argv };
+  // NOTE(review): match_str is capture group 1; the start offset computed below is only
+  // right when that group spans the whole match -- confirm.
+  if (!re->DoMatch(str_sp, RE2::UNANCHORED, &consumed, args, 1)) return IntVal(0);
+  total_consumed += consumed;
+  if (return_opt.val == 0) {
+    return IntVal(total_consumed - match_str.length() + 1);
+  } else { // return_opt.val == 1
+    return IntVal(total_consumed + 1);
+  }
+}
+
+IntVal StringFunctions::RegexpInstr(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern, const BigIntVal& position, const BigIntVal& occurrence) {
+  return RegexpInstr(context, str, pattern, position, occurrence, TinyIntVal(0));
+}
+
+IntVal StringFunctions::RegexpInstr(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern, const BigIntVal& position) {
+  return RegexpInstr(context, str, pattern, position, BigIntVal(1), TinyIntVal(0));
+}
+
+IntVal StringFunctions::RegexpInstr(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern) {
+  return RegexpInstr(context, str, pattern, BigIntVal(1), BigIntVal(1), TinyIntVal(0));
+}
+
+IntVal StringFunctions::RegexpSimilar(FunctionContext* context, const StringVal& str,
+    const StringVal& pattern) {
+  if (str.is_null || pattern.is_null) return IntVal::null();
+
+  re2::RE2* re = reinterpret_cast<re2::RE2*>(
+      context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
+  scoped_ptr<re2::RE2> scoped_re; // destroys re if state->re is NULL
+  if (re == NULL) {
+    DCHECK(!context->IsArgConstant(1));
+    string error_str;
+    re = CompileRegex(pattern, &error_str);
+    if (re == NULL) {
+      context->AddWarning(error_str.c_str());
+      return IntVal::null();
+    }
+    scoped_re.reset(re);
+  }
+
+  re2::StringPiece str_sp =
+      re2::StringPiece(reinterpret_cast<char*>(str.ptr), str.len);
+  if (re2::RE2::FullMatch(str_sp, *re)) {
+    return IntVal(1);
+  } else {
+    return IntVal(0);
+  }
+}
+
 StringVal StringFunctions::Concat(FunctionContext* context, int num_children,
     const StringVal* strs) {
   return ConcatWs(context, StringVal(), num_children, strs);
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index e3d6de2..fefcdd2 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -63,6 +63,38 @@ class StringFunctions {
       const StringVal& pattern, const BigIntVal& index);
   static StringVal RegexpReplace(FunctionContext*, const StringVal& str,
       const StringVal& pattern, const StringVal& replace);
+  // Overloads taking a 1-based start 'position' (negative values count from the end of
+  // 'str') and an 'occurrence' selecting which match to replace (0 replaces all).
+  static StringVal RegexpReplace(FunctionContext*, const StringVal& str,
+      const StringVal& pattern, const StringVal& replace,
+      const BigIntVal& position);
+  static StringVal RegexpReplace(FunctionContext*, const StringVal& str,
+      const StringVal& pattern, const StringVal& replace,
+      const BigIntVal& position, const BigIntVal& occurrence);
+  // Returns the 'occurrence'-th match of 'pattern' in 'str', searching from 'position'.
+  // 'position' and 'occurrence' default to 1.
+  static StringVal RegexpSubstr(FunctionContext*, const StringVal& str,
+      const StringVal& pattern);
+  static StringVal RegexpSubstr(FunctionContext*, const StringVal& str,
+      const StringVal& pattern, const BigIntVal& position);
+  static StringVal RegexpSubstr(FunctionContext*, const StringVal& str,
+      const StringVal& pattern, const BigIntVal& position,
+      const BigIntVal& occurrence);
+  // Returns the 1-based index in 'str' of the 'occurrence'-th match searching from
+  // 'position', or 0 if there is none. 'return_opt' 0 yields the start of the match,
+  // 1 the index just past it; any other value yields NULL.
+  static IntVal RegexpInstr(FunctionContext*, const StringVal& str,
+      const StringVal& pattern);
+  static IntVal RegexpInstr(FunctionContext*, const StringVal& str,
+      const StringVal& pattern, const BigIntVal& position);
+  static IntVal RegexpInstr(FunctionContext*, const StringVal& str,
+      const StringVal& pattern, const BigIntVal& position,
+      const BigIntVal& occurrence);
+  static IntVal RegexpInstr(FunctionContext*, const StringVal& str,
+      const StringVal& pattern, const BigIntVal& position,
+      const BigIntVal& occurrence, const TinyIntVal& return_opt);
+  // Returns 1 if 'pattern' matches all of 'str' and 0 otherwise.
+  static IntVal RegexpSimilar(FunctionContext*, const StringVal& str, const StringVal& pattern);
 
   static StringVal Concat(FunctionContext*, int num_children, const StringVal* strs);
   static StringVal ConcatWs(FunctionContext*, const StringVal& sep, int num_children,
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 094d040..795f498 100755
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -398,6 +398,46 @@ functions = [
    'impala::StringFunctions::RegexpReplace',
    '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
    '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_replace'], 'STRING', ['STRING', 'STRING', 'STRING', 'BIGINT'],
+   'impala::StringFunctions::RegexpReplace',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_replace'], 'STRING', ['STRING', 'STRING', 'STRING', 'BIGINT', 'BIGINT'],
+   'impala::StringFunctions::RegexpReplace',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_substr'], 'STRING', ['STRING', 'STRING'],
+   'impala::StringFunctions::RegexpSubstr',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_substr'], 'STRING', ['STRING', 'STRING', 'BIGINT'],
+   'impala::StringFunctions::RegexpSubstr',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_substr'], 'STRING', ['STRING', 'STRING', 'BIGINT', 'BIGINT'],
+   'impala::StringFunctions::RegexpSubstr',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_instr'], 'INT', ['STRING', 'STRING'],
+   'impala::StringFunctions::RegexpInstr',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_instr'], 'INT', ['STRING', 'STRING', 'BIGINT'],
+   'impala::StringFunctions::RegexpInstr',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_instr'], 'INT', ['STRING', 'STRING', 'BIGINT', 'BIGINT'],
+   'impala::StringFunctions::RegexpInstr',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_instr'], 'INT', ['STRING', 'STRING', 'BIGINT', 'BIGINT', 'TINYINT'],
+   'impala::StringFunctions::RegexpInstr',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['regexp_similar'], 'INT', ['STRING', 'STRING'],
+   'impala::StringFunctions::RegexpSimilar',
+   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   '_ZN6impala15StringFunctions11RegexpCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
   [['concat'], 'STRING', ['STRING', '...'], 'impala::StringFunctions::Concat'],
   [['concat_ws'], 'STRING', ['STRING', 'STRING', '...'],
    'impala::StringFunctions::ConcatWs'],
diff --git a/thirdparty/re2/re2/re2.cc b/thirdparty/re2/re2/re2.cc
index 8d1d468..394bde4 100644
--- a/thirdparty/re2/re2/re2.cc
+++ b/thirdparty/re2/re2/re2.cc
@@ -368,16 +368,29 @@ bool RE2::Replace(string *str,
 
 int RE2::GlobalReplace(string *str,
                       const RE2& re,
-                      const StringPiece& rewrite) {
+                      const StringPiece& rewrite,
+                      int position,
+                      int occurrence) {
   StringPiece vec[kVecSize];
   int nvec = 1 + MaxSubmatch(rewrite);
   if (nvec > arraysize(vec))
     return false;
+  if (position <= 0)
+    position = 1;
 
   const char* p = str->data();
   const char* ep = p + str->size();
   const char* lastend = NULL;
   string out;
+  if (1 < position) {
+    if (static_cast<size_t>(position) <= str->size()) {
+      out.append(p, position - 1);
+      p += position - 1;
+      lastend = p;
+    } else {
+      return 0;
+    }
+  }
   int count = 0;
   while (p <= ep) {
     if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
@@ -388,13 +401,31 @@ int RE2::GlobalReplace(string *str,
       // Disallow empty match at end of last match: skip ahead.
       if (p < ep)
         out.append(p, 1);
-      p++;
+      ++p;
       continue;
     }
-    re.Rewrite(&out, rewrite, vec, nvec);
+    if (occurrence == 0) {
+      re.Rewrite(&out, rewrite, vec, nvec);
+    } else {
+      if (0 < occurrence) {
+        if (count == occurrence - 1) {
+          re.Rewrite(&out, rewrite, vec, nvec);
+          p = vec[0].end();
+          lastend = p;
+          ++count;
+          break;
+        } else if (count < occurrence - 1) {
+          out.append(vec[0].data(), vec[0].length());
+        } else { // if count > occurrence - 1
+          break;
+        }
+      } else { // if 0 > occurrence
+        break;
+      }
+    }
     p = vec[0].end();
     lastend = p;
-    count++;
+    ++count;
   }
 
   if (count == 0)
diff --git a/thirdparty/re2/re2/re2.h b/thirdparty/re2/re2/re2.h
index 272028b..af17c45 100644
--- a/thirdparty/re2/re2/re2.h
+++ b/thirdparty/re2/re2/re2.h
@@ -386,7 +386,14 @@ class RE2 {
   // Returns the number of replacements made.
+  //
+  // "position" is the 1-based offset in "str" where matching starts; values <= 0 are
+  // treated as 1, and a position greater than both 1 and the length of "str" makes
+  // the call return 0. "occurrence" selects the only match to rewrite, with 0 meaning
+  // every match; for a positive "occurrence" the return value also counts the matches
+  // that were skipped before the rewritten one.
   static int GlobalReplace(string *str,
                            const RE2& pattern,
-                           const StringPiece& rewrite);
+                           const StringPiece& rewrite,
+                           int position,
+                           int occurrence);
 
   // Like Replace, except that if the pattern matches, "rewrite"
   // is copied into "out" with substitutions.  The non-matching
@@ -690,15 +698,15 @@ class RE2 {
   static inline Arg Octal(long long* x);
   static inline Arg Octal(unsigned long long* x);
 
- private:
-  void Init(const StringPiece& pattern, const Options& options);
-
   bool DoMatch(const StringPiece& text,
                Anchor anchor,
                int* consumed,
                const Arg* const args[],
                int n) const;
 
+ private:
+  void Init(const StringPiece& pattern, const Options& options);
+
   re2::Prog* ReverseProg() const;
 
   mutable Mutex* mutex_;
diff --git a/thirdparty/re2/re2/testing/re2_test.cc b/thirdparty/re2/re2/testing/re2_test.cc
index b99cacf..7f68099 100644
--- a/thirdparty/re2/re2/testing/re2_test.cc
+++ b/thirdparty/re2/re2/testing/re2_test.cc
@@ -179,7 +179,73 @@ TEST(RE2, Replace) {
     CHECK(RE2::Replace(&one, t->regexp, t->rewrite));
     CHECK_EQ(one, t->single);
     string all(t->original);
-    CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
+    CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite, 1, 0), t->greplace_count)
+        << "Got: " << all;
+    CHECK_EQ(all, t->global);
+  }
+}
+
+TEST(RE2, Replace2) {
+  VLOG(1) << "TestReplace2";
+
+  struct ReplaceTest {
+    const char *regexp;
+    const char *rewrite;
+    const char *original;
+    const char *single;
+    const char *global;
+    int greplace_count;
+  };
+  // "global" is the expected result of GlobalReplace() with position 5 and occurrence 2,
+  // and "greplace_count" is the expected return value of that call.
+  static const ReplaceTest tests[] = {
+    { "b",
+      "bb",
+      "ababababab",
+      "abbabababab",
+      "ababababbab",
+      2 },
+    { "b",
+      "bb",
+      "bbbbbb",
+      "bbbbbbb",
+      "bbbbbbb",
+      2 },
+    { "b+",
+      "bb",
+      "bbbbbb",
+      "bb",
+      "bbbbbb",
+      1 },
+    { "b*",
+      "bb",
+      "bbbbbb",
+      "bb",
+      "bbbbbb",
+      1 },
+    { "b*",
+      "bb",
+      "aaaaa",
+      "bbaaaaa",
+      "aaaaa",
+      1 },
+    // Check newline handling
+    { "a.*a",
+      "(\\0)",
+      "aba\naba\naba",
+      "(aba)\naba\naba",
+      "aba\naba\n(aba)",
+      2 },
+    { "", NULL, NULL, NULL, NULL, 0 }
+  };
+
+  for (const ReplaceTest *t = tests; t->original != NULL; ++t) {
+    VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite);
+    string one(t->original);
+    CHECK(RE2::Replace(&one, t->regexp, t->rewrite));
+    CHECK_EQ(one, t->single);
+    string all(t->original);
+    CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite, 5, 2), t->greplace_count)
       << "Got: " << all;
     CHECK_EQ(all, t->global);
   }
@@ -289,6 +355,29 @@ TEST(RE2, FindAndConsume) {
   CHECK_EQ(input, "");
 }
 
+TEST(RE2, DoMatch) {
+  VLOG(1) << "DoMatch";
+
+  RE2 r("(\\w+thy)"); // matches a word
+  string word;
+  string s("healthy, wealthy, and wise");
+  StringPiece input(s);
+
+  int consumed;
+  RE2::Arg argv;
+  argv = &word;
+  const RE2::Arg* const args[] = { &argv };
+  if (r.DoMatch(input, RE2::UNANCHORED, &consumed, args, 1)) {
+    input.remove_prefix(consumed);
+  }
+  CHECK_EQ(word, "healthy");
+  if (r.DoMatch(input, RE2::UNANCHORED, &consumed, args, 1)) {
+    input.remove_prefix(consumed);
+  }
+  CHECK_EQ(word, "wealthy");
+  CHECK(!r.DoMatch(input, RE2::UNANCHORED, &consumed, args, 1));
+}
+
 TEST(RE2, FindAndConsumeN) {
   const string s(" one two three 4");
   StringPiece input(s);