Grokbase Groups Hive commits May 2015
FAQ
HIVE-10802 : Table join query with some constant field in select fails (Aihua Xu via Ashutosh Chauhan)

Signed-off-by: Ashutosh Chauhan <hashutosh@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/66acd266
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/66acd266
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/66acd266

Branch: refs/heads/master
Commit: 66acd26670e417a3b7c7e0abee3e94b71571e0e3
Parents: 35e91f2
Author: Aihua Xu <aihuaxu@gmail.com>
Authored: Sat May 30 15:13:00 2015 -0700
Committer: Ashutosh Chauhan <hashutosh@apache.org>
Committed: Sun May 31 00:49:46 2015 -0700

----------------------------------------------------------------------
  .../optimizer/ConstantPropagateProcFactory.java | 57 +++++----
  .../clientpositive/udf_nondeterministic.q | 11 ++
  .../clientpositive/spark/udf_in_file.q.out | 2 +-
  .../results/clientpositive/udf_in_file.q.out | 2 +-
  .../clientpositive/udf_nondeterministic.q.out | 120 +++++++++++++++++++
  5 files changed, 164 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/66acd266/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java
index e66de1a..4a4814d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java
@@ -196,9 +196,13 @@ public final class ConstantPropagateProcFactory {
    /**
     * Fold input expression desc.
     *
- * If desc is a UDF and all parameters are constants, evaluate it. If desc is a column expression,
- * find it from propagated constants, and if there is, replace it with constant.
- *
+ * This function recursively checks if any subexpression of a specified expression
+ * can be evaluated to be constant and replaces such subexpression with the constant.
+ * If the expression is a derterministic UDF and all the subexpressions are constants,
+ * the value will be calculated immediately (during compilation time vs. runtime).
+ * e.g.:
+ * concat(year, month) => 200112 for year=2001, month=12 since concat is deterministic UDF
+ * unix_timestamp(time) => unix_timestamp(123) for time=123 since unix_timestamp is nonderministic UDF
     * @param desc folding expression
     * @param constants current propagated constant map
     * @param cppCtx
@@ -213,12 +217,7 @@ public final class ConstantPropagateProcFactory {
      if (desc instanceof ExprNodeGenericFuncDesc) {
        ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc) desc;

- // The function must be deterministic, or we can't fold it.
        GenericUDF udf = funcDesc.getGenericUDF();
- if (!isDeterministicUdf(udf)) {
- LOG.debug("Function " + udf.getClass() + " undeterministic, quit folding.");
- return desc;
- }

        boolean propagateNext = propagate && propagatableUdfs.contains(udf.getClass());
        List<ExprNodeDesc> newExprs = new ArrayList<ExprNodeDesc>();
@@ -226,27 +225,33 @@ public final class ConstantPropagateProcFactory {
          newExprs.add(foldExpr(childExpr, constants, cppCtx, op, tag, propagateNext));
        }

- // If all child expressions are constants, evaluate UDF immediately
- ExprNodeDesc constant = evaluateFunction(udf, newExprs, desc.getChildren());
- if (constant != null) {
- LOG.debug("Folding expression:" + desc + " -> " + constant);
- return constant;
+ // Don't evalulate nondeterministic function since the value can only calculate during runtime.
+ if (!isDeterministicUdf(udf)) {
+ LOG.debug("Function " + udf.getClass() + " is undeterministic. Don't evalulating immediately.");
+ ((ExprNodeGenericFuncDesc) desc).setChildren(newExprs);
+ return desc;
        } else {
-
- // Check if the function can be short cut.
- ExprNodeDesc shortcut = shortcutFunction(udf, newExprs, op);
- if (shortcut != null) {
- LOG.debug("Folding expression:" + desc + " -> " + shortcut);
- return shortcut;
+ // If all child expressions of deterministic function are constants, evaluate such UDF immediately
+ ExprNodeDesc constant = evaluateFunction(udf, newExprs, desc.getChildren());
+ if (constant != null) {
+ LOG.debug("Folding expression:" + desc + " -> " + constant);
+ return constant;
+ } else {
+ // Check if the function can be short cut.
+ ExprNodeDesc shortcut = shortcutFunction(udf, newExprs, op);
+ if (shortcut != null) {
+ LOG.debug("Folding expression:" + desc + " -> " + shortcut);
+ return shortcut;
+ }
+ ((ExprNodeGenericFuncDesc) desc).setChildren(newExprs);
          }
- ((ExprNodeGenericFuncDesc) desc).setChildren(newExprs);
- }

- // If in some selected binary operators (=, is null, etc), one of the
- // expressions are
- // constant, add them to colToConstatns as half-deterministic columns.
- if (propagate) {
- propagate(udf, newExprs, op.getSchema(), constants);
+ // If in some selected binary operators (=, is null, etc), one of the
+ // expressions are
+ // constant, add them to colToConstants as half-deterministic columns.
+ if (propagate) {
+ propagate(udf, newExprs, op.getSchema(), constants);
+ }
        }

        return desc;

http://git-wip-us.apache.org/repos/asf/hive/blob/66acd266/ql/src/test/queries/clientpositive/udf_nondeterministic.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/udf_nondeterministic.q b/ql/src/test/queries/clientpositive/udf_nondeterministic.q
new file mode 100644
index 0000000..fcb5162
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/udf_nondeterministic.q
@@ -0,0 +1,11 @@
+drop table if exists udf_tb1;
+drop table if exists udf_tb2;
+
+create table udf_tb1 (year int, month int);
+create table udf_tb2(month int);
+insert into udf_tb1 values(2001, 11);
+insert into udf_tb2 values(11);
+
+explain
+select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month);
+select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month);

http://git-wip-us.apache.org/repos/asf/hive/blob/66acd266/ql/src/test/results/clientpositive/spark/udf_in_file.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/udf_in_file.q.out b/ql/src/test/results/clientpositive/spark/udf_in_file.q.out
index c5b6205..5ba62b0 100644
--- a/ql/src/test/results/clientpositive/spark/udf_in_file.q.out
+++ b/ql/src/test/results/clientpositive/spark/udf_in_file.q.out
@@ -59,7 +59,7 @@ STAGE PLANS:
            alias: value_src
            Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE
            Select Operator
- expressions: in_file(str_val, '../../data/files/test2.dat') (type: boolean), in_file(ch_val, '../../data/files/test2.dat') (type: boolean), in_file(vch_val, '../../data/files/test2.dat') (type: boolean), in_file(str_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(ch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(vch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file('303', '../../data/files/test2.dat') (type: boolean), in_file('304', '../../data/files/test2.dat') (type: boolean), in_file(UDFToString(null), '../../data/files/test2.dat') (type: boolean)
+ expressions: in_file(str_val, '../../data/files/test2.dat') (type: boolean), in_file(ch_val, '../../data/files/test2.dat') (type: boolean), in_file(vch_val, '../../data/files/test2.dat') (type: boolean), in_file(str_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(ch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(vch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file('303', '../../data/files/test2.dat') (type: boolean), in_file('304', '../../data/files/test2.dat') (type: boolean), in_file(null, '../../data/files/test2.dat') (type: boolean)
              outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
              Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE
              Limit

http://git-wip-us.apache.org/repos/asf/hive/blob/66acd266/ql/src/test/results/clientpositive/udf_in_file.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/udf_in_file.q.out b/ql/src/test/results/clientpositive/udf_in_file.q.out
index b9d0d25..6d2c84b 100644
--- a/ql/src/test/results/clientpositive/udf_in_file.q.out
+++ b/ql/src/test/results/clientpositive/udf_in_file.q.out
@@ -59,7 +59,7 @@ STAGE PLANS:
              alias: value_src
              Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE
              Select Operator
- expressions: in_file(str_val, '../../data/files/test2.dat') (type: boolean), in_file(ch_val, '../../data/files/test2.dat') (type: boolean), in_file(vch_val, '../../data/files/test2.dat') (type: boolean), in_file(str_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(ch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(vch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file('303', '../../data/files/test2.dat') (type: boolean), in_file('304', '../../data/files/test2.dat') (type: boolean), in_file(UDFToString(null), '../../data/files/test2.dat') (type: boolean)
+ expressions: in_file(str_val, '../../data/files/test2.dat') (type: boolean), in_file(ch_val, '../../data/files/test2.dat') (type: boolean), in_file(vch_val, '../../data/files/test2.dat') (type: boolean), in_file(str_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(ch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(vch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file('303', '../../data/files/test2.dat') (type: boolean), in_file('304', '../../data/files/test2.dat') (type: boolean), in_file(null, '../../data/files/test2.dat') (type: boolean)
                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
                Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE
                Limit

http://git-wip-us.apache.org/repos/asf/hive/blob/66acd266/ql/src/test/results/clientpositive/udf_nondeterministic.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/udf_nondeterministic.q.out b/ql/src/test/results/clientpositive/udf_nondeterministic.q.out
new file mode 100644
index 0000000..eef5555
--- /dev/null
+++ b/ql/src/test/results/clientpositive/udf_nondeterministic.q.out
@@ -0,0 +1,120 @@
+PREHOOK: query: drop table if exists udf_tb1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists udf_tb1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists udf_tb2
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists udf_tb2
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table udf_tb1 (year int, month int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@udf_tb1
+POSTHOOK: query: create table udf_tb1 (year int, month int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@udf_tb1
+PREHOOK: query: create table udf_tb2(month int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@udf_tb2
+POSTHOOK: query: create table udf_tb2(month int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@udf_tb2
+PREHOOK: query: insert into udf_tb1 values(2001, 11)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@udf_tb1
+POSTHOOK: query: insert into udf_tb1 values(2001, 11)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@udf_tb1
+POSTHOOK: Lineage: udf_tb1.month EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+POSTHOOK: Lineage: udf_tb1.year EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into udf_tb2 values(11)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__2
+PREHOOK: Output: default@udf_tb2
+POSTHOOK: query: insert into udf_tb2 values(11)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__2
+POSTHOOK: Output: default@udf_tb2
+POSTHOOK: Lineage: udf_tb2.month EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: explain
+select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: udf_tb1
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: ((year = 2001) and month is not null) (type: boolean)
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: month (type: int)
+ outputColumnNames: _col1
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ TableScan
+ alias: b
+ Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: month is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: month (type: int)
+ sort order: +
+ Map-reduce partition columns: month (type: int)
+ Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int)
+ 1 month (type: int)
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: unix_timestamp('2001-01-01 00:00:00') (type: bigint)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@udf_tb1
+PREHOOK: Input: default@udf_tb2
+#### A masked pattern was here ####
+POSTHOOK: query: select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@udf_tb1
+POSTHOOK: Input: default@udf_tb2
+#### A masked pattern was here ####
+978336000

Search Discussions

Discussion Posts

Follow ups

Related Discussions

Discussion Navigation
viewthread | post
posts ‹ prev | 1 of 2 | next ›
Discussion Overview
groupcommits @
categorieshive, hadoop
postedMay 31, '15 at 7:50a
activeMay 31, '15 at 7:50a
posts2
users1
websitehive.apache.org

1 user in discussion

Hashutosh: 2 posts

People

Translate

site design / logo © 2021 Grokbase