FAQ
Impala's built-in function regexp_extract() returns a different result than
Hive's.

It seems Impala does not return the longest of the possible matches.


*Steps to reproduce*
Impala> select regexp_extract('I001=-200,I003=-210,I007=0', 'I001=-?[0-9]+',
0)

I001=-2


hive> select regexp_extract('I001=-200,I003=-210,I007=0', 'I001=-?[0-9]+',
0) from tmp

I001=-200



To return the leftmost-longest match, the fifth argument to
boost::regex_search should be regex_constants::match_posix or
regex_constants::match_default rather than regex_constants::match_any.

Is this a bug?


< be/src/exprs/string-functions.cc >

372 void* StringFunctions::RegexpExtract(Expr* e, TupleRow* row) {
373 DCHECK_EQ(e->GetNumChildren(), 3);
374 StringValue* str = reinterpret_cast<StringValue*>(e->children()[0]
->GetValue(row));
375 StringValue* pattern = reinterpret_cast<StringValue*>(e->children()[1]
->GetValue(row));
376 int32_t* index = reinterpret_cast<int32_t*>(e->children()[2]
->GetValue(row));
377 if (str == NULL || pattern == NULL || index == NULL) return NULL;
378 FunctionCall* func_expr = static_cast<FunctionCall*>(e);
379 // Compile the regex if pattern is not constant,
380 // or if pattern is constant and this is the first function invocation.
381 if ((!e->children()[1]->IsConstant()) ||
382 (e->children()[1]->IsConstant() && func_expr->GetRegex() == NULL)) {
383 string pattern_str(pattern->ptr, pattern->len);
384 bool valid_pattern = func_expr->SetRegex(pattern_str);
385 // Hive throws an exception for invalid patterns.
386 if (!valid_pattern) { 387 return NULL; 388 }
389 }
390 DCHECK(func_expr->GetRegex() != NULL);
391 cmatch matches;
392 // cast's are necessary to make boost understand which function we want.
393 bool success = regex_search(const_cast<const char*>(str->ptr),
394 const_cast<const char*>(str->ptr) + str->len,
395 matches, *func_expr->GetRegex(), regex_constants::match_any); // <==
should change regex_constants::match_any to regex_constants::match_posix
396 if (!success) { 397 e->result_.SetStringVal(""); 398 return
&e->result_.string_val; 399 }
400 // match[0] is the whole string, match_res.str(1) the first group, etc.
401 e->result_.SetStringVal(matches[*index]);
402 return &e->result_.string_val;

403 }

Search Discussions

Related Discussions

Discussion Navigation
viewthread | post
Discussion Overview
group: impala-user @
categories: hadoop
posted: Apr 11, '13 at 9:31a
active: Apr 11, '13 at 9:32a
posts: 2
users: 1
website: cloudera.com
irc: #hadoop

1 user in discussion

Jung-Yup Lee: 2 posts

People

Translate

site design / logo © 2022 Grokbase