FAQ
Author: kevinwilfong
Date: Wed Jan 9 17:59:23 2013
New Revision: 1430979

URL: http://svn.apache.org/viewvc?rev=1430979&view=rev
Log:
HIVE-3552. performant manner for performing cubes/rollups/grouping sets for a high number of grouping set keys.

Added:
     hive/trunk/data/files/grouping_sets1.txt
     hive/trunk/data/files/grouping_sets2.txt
     hive/trunk/ql/src/test/queries/clientnegative/groupby_grouping_sets6.q
     hive/trunk/ql/src/test/queries/clientnegative/groupby_grouping_sets7.q
     hive/trunk/ql/src/test/queries/clientpositive/groupby_grouping_sets2.q
     hive/trunk/ql/src/test/queries/clientpositive/groupby_grouping_sets3.q
     hive/trunk/ql/src/test/queries/clientpositive/groupby_grouping_sets4.q
     hive/trunk/ql/src/test/queries/clientpositive/groupby_grouping_sets5.q
     hive/trunk/ql/src/test/results/clientnegative/groupby_grouping_sets6.q.out
     hive/trunk/ql/src/test/results/clientnegative/groupby_grouping_sets7.q.out
     hive/trunk/ql/src/test/results/clientpositive/groupby_grouping_sets2.q.out
     hive/trunk/ql/src/test/results/clientpositive/groupby_grouping_sets3.q.out
     hive/trunk/ql/src/test/results/clientpositive/groupby_grouping_sets4.q.out
     hive/trunk/ql/src/test/results/clientpositive/groupby_grouping_sets5.q.out
Modified:
     hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
     hive/trunk/conf/hive-default.xml.template
     hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
     hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
     hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
     hive/trunk/ql/src/test/results/compiler/plan/groupby1.q.xml
     hive/trunk/ql/src/test/results/compiler/plan/groupby2.q.xml
     hive/trunk/ql/src/test/results/compiler/plan/groupby3.q.xml
     hive/trunk/ql/src/test/results/compiler/plan/groupby4.q.xml
     hive/trunk/ql/src/test/results/compiler/plan/groupby5.q.xml
     hive/trunk/ql/src/test/results/compiler/plan/groupby6.q.xml

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1430979&r1=1430978&r2=1430979&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Wed Jan 9 17:59:23 2013
@@ -407,6 +407,7 @@ public class HiveConf extends Configurat
      HIVEMULTIGROUPBYSINGLEREDUCER("hive.multigroupby.singlereducer", true),
      HIVE_MAP_GROUPBY_SORT("hive.map.groupby.sorted", false),
      HIVE_GROUPBY_ORDERBY_POSITION_ALIAS("hive.groupby.orderby.position.alias", false),
+ HIVE_NEW_JOB_GROUPING_SET_CARDINALITY("hive.new.job.grouping.set.cardinality", 30),

      // for hive udtf operator
      HIVEUDTFAUTOPROGRESS("hive.udtf.auto.progress", false),

Modified: hive/trunk/conf/hive-default.xml.template
URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1430979&r1=1430978&r2=1430979&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml.template (original)
+++ hive/trunk/conf/hive-default.xml.template Wed Jan 9 17:59:23 2013
@@ -526,6 +526,22 @@
  </property>

  <property>
+ <name>hive.new.job.grouping.set.cardinality</name>
+ <value>30</value>
+ <description>
+ Whether a new map-reduce job should be launched for grouping sets/rollups/cubes.
+ For a query like: select a, b, c, count(1) from T group by a, b, c with rollup;
+ 4 rows are created per row: (a, b, c), (a, b, null), (a, null, null), (null, null, null).
+ This can lead to explosion across map-reduce boundary if the cardinality of T is very high,
+ and map-side aggregation does not do a very good job.
+
+ This parameter decides if hive should add an additional map-reduce job. If the grouping set
+ cardinality (4 in the example above) is more than this value, a new MR job is added under the
+ assumption that the original group by will reduce the data size.
+ </description>
+</property>
+
+<property>
    <name>hive.join.emit.interval</name>
    <value>1000</value>
    <description>How many rows in the right-most join operand Hive should buffer before emitting the join result. </description>

Added: hive/trunk/data/files/grouping_sets1.txt
URL: http://svn.apache.org/viewvc/hive/trunk/data/files/grouping_sets1.txt?rev=1430979&view=auto
==============================================================================
--- hive/trunk/data/files/grouping_sets1.txt (added)
+++ hive/trunk/data/files/grouping_sets1.txt Wed Jan 9 17:59:23 2013
@@ -0,0 +1,6 @@
+8 1 1
+5 1 2
+1 1 3
+2 2 4
+2 3 5
+3 2 8

Added: hive/trunk/data/files/grouping_sets2.txt
URL: http://svn.apache.org/viewvc/hive/trunk/data/files/grouping_sets2.txt?rev=1430979&view=auto
==============================================================================
--- hive/trunk/data/files/grouping_sets2.txt (added)
+++ hive/trunk/data/files/grouping_sets2.txt Wed Jan 9 17:59:23 2013
@@ -0,0 +1,6 @@
+8 1 1
+1 2 2
+1 1 3
+2 2 4
+2 3 5
+2 2 8

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java?rev=1430979&r1=1430978&r2=1430979&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java Wed Jan 9 17:59:23 2013
@@ -313,6 +313,17 @@ public enum ErrorMsg {
    INVALID_POSITION_ALIAS_IN_ORDERBY(10221,
      "Invalid position alias in Order By\n"),

+ HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW(10225,
+ "An additional MR job is introduced since the number of rows created per input row " +
+ "due to grouping sets is more than hive.new.job.grouping.set.cardinality. There is no need " +
+ "to handle skew separately. set hive.groupby.skewindata to false."),
+ HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_DISTINCTS(10226,
+ "An additional MR job is introduced since the cardinality of grouping sets " +
+ "is more than hive.new.job.grouping.set.cardinality. This functionality is not supported " +
+ "with distincts. Either set hive.new.job.grouping.set.cardinality to a high number " +
+ "(higher than the number of rows per input row due to grouping sets in the query), or " +
+ "rewrite the query to not use distincts."),
+
    SCRIPT_INIT_ERROR(20000, "Unable to initialize custom script."),
    SCRIPT_IO_ERROR(20001, "An error occurred while reading or writing to your custom script. "
        + "It may have crashed with an error."),

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java?rev=1430979&r1=1430978&r2=1430979&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java Wed Jan 9 17:59:23 2013
@@ -214,9 +214,6 @@ public class GroupByOperator extends Ope
          HiveConf.ConfVars.HIVESENDHEARTBEAT);
      countAfterReport = 0;
      groupingSetsPresent = conf.isGroupingSetsPresent();
- groupingSets = conf.getListGroupingSets();
- groupingSetsPosition = conf.getGroupingSetPosition();
-
      ObjectInspector rowInspector = inputObjInspectors[0];

      // init keyFields
@@ -236,6 +233,8 @@ public class GroupByOperator extends Ope
      // Initialize the constants for the grouping sets, so that they can be re-used for
      // each row
      if (groupingSetsPresent) {
+ groupingSets = conf.getListGroupingSets();
+ groupingSetsPosition = conf.getGroupingSetPosition();
        newKeysGroupingSets = new ArrayList<Object>();
        groupingSetsBitSet = new ArrayList<FastBitSet>();

Search Discussions

Discussion Posts

Follow ups

Related Discussions

Discussion Navigation
viewthread | post
posts ‹ prev | 1 of 3 | next ›
Discussion Overview
group: commits @
categories: hive, hadoop
posted: Jan 9, '13 at 5:59p
active: Jan 9, '13 at 5:59p
posts: 3
users: 1
website: hive.apache.org

1 user in discussion

Kevinwilfong: 3 posts

People

Translate

site design / logo © 2021 Grokbase