HIVE-12025 refactor bucketId generating code (Eugene Koifman, reviewed by Prashanth Jayachandran, Sergey Shelukhin, Elliot West)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/d4c0fa46
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/d4c0fa46
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/d4c0fa46

Branch: refs/heads/branch-1
Commit: d4c0fa4623cea0b32d47447dd32e2610be69876c
Parents: 7bf681d
Author: Eugene Koifman <ekoifman@hortonworks.com>
Authored: Sat Oct 10 11:07:10 2015 -0700
Committer: Eugene Koifman <ekoifman@hortonworks.com>
Committed: Sat Oct 10 11:07:10 2015 -0700

----------------------------------------------------------------------
  .../hadoop/hive/ql/exec/FileSinkOperator.java | 9 ++++---
  .../hadoop/hive/ql/exec/ReduceSinkOperator.java | 23 ++++++++----------
  .../hive/ql/io/DefaultHivePartitioner.java | 3 ++-
  .../hive/ql/udf/generic/GenericUDFHash.java | 11 ++++-----
  .../hive/ql/lockmgr/TestDbTxnManager.java | 7 ++++--
  .../objectinspector/ObjectInspectorUtils.java | 13 ++++++----
  .../TestObjectInspectorUtils.java | 25 ++++++++++++++++++++
  7 files changed, 59 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/d4c0fa46/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
index 553113e..ed791c0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
@@ -808,12 +808,11 @@ public class FileSinkOperator extends TerminalOperator<FileSinkDesc> implements
      if (!multiFileSpray) {
        return 0;
      } else {
-       int keyHashCode = 0;
-       for (int i = 0; i < partitionEval.length; i++) {
-         Object o = partitionEval[i].evaluate(row);
-         keyHashCode = keyHashCode * 31
-             + ObjectInspectorUtils.hashCode(o, partitionObjectInspectors[i]);
+       Object[] bucketFieldValues = new Object[partitionEval.length];
+       for(int i = 0; i < partitionEval.length; i++) {
+         bucketFieldValues[i] = partitionEval[i].evaluate(row);
        }
+       int keyHashCode = ObjectInspectorUtils.getBucketHashCode(bucketFieldValues, partitionObjectInspectors);
        key.setHashCode(keyHashCode);
        int bucketNum = prtner.getBucket(key, null, totalFiles);
        return bucketMap.get(bucketNum);
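
The behavior here is unchanged: getBucketHashCode (made public in ObjectInspectorUtils below) folds the per-field hashes together with the same "hash * 31 + fieldHash" rule the removed inline loop used. A minimal standalone sketch of that combining step, with plain ints standing in for the ObjectInspectorUtils.hashCode values (the class name and main method are only for illustration):

  public class BucketHashSketch {
    // Same fold the old inline loops performed and getBucketHashCode now centralizes.
    static int combine(int[] fieldHashes) {
      int hashCode = 0;
      for (int fieldHash : fieldHashes) {
        hashCode = hashCode * 31 + fieldHash;
      }
      return hashCode;
    }

    public static void main(String[] args) {
      // Three field hashes; the result is what key.setHashCode(...) receives.
      System.out.println(combine(new int[] {1, 2, 3})); // 1*31*31 + 2*31 + 3 = 1026
    }
  }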

http://git-wip-us.apache.org/repos/asf/hive/blob/d4c0fa46/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
index f1df608..dd08210 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
@@ -405,27 +405,24 @@ public class ReduceSinkOperator extends TerminalOperator<ReduceSinkDesc>
    }

    private int computeBucketNumber(Object row, int numBuckets) throws HiveException {
-     int buckNum = 0;
-
      if (conf.getWriteType() == AcidUtils.Operation.UPDATE ||
          conf.getWriteType() == AcidUtils.Operation.DELETE) {
-       // We don't need to evalute the hash code. Instead read the bucket number directly from
+       // We don't need to evaluate the hash code. Instead read the bucket number directly from
        // the row. I don't need to evaluate any expressions as I know I am reading the ROW__ID
        // column directly.
        Object recIdValue = acidRowInspector.getStructFieldData(row, recIdField);
-       buckNum = bucketInspector.get(recIdInspector.getStructFieldData(recIdValue, bucketField));
+       int buckNum = bucketInspector.get(recIdInspector.getStructFieldData(recIdValue, bucketField));
        if (isLogTraceEnabled) {
          LOG.trace("Acid choosing bucket number " + buckNum);
        }
+       return buckNum;
      } else {
+       Object[] bucketFieldValues = new Object[bucketEval.length];
        for (int i = 0; i < bucketEval.length; i++) {
-         Object o = bucketEval[i].evaluate(row);
-         buckNum = buckNum * 31 + ObjectInspectorUtils.hashCode(o, bucketObjectInspectors[i]);
+         bucketFieldValues[i] = bucketEval[i].evaluate(row);
        }
+       return ObjectInspectorUtils.getBucketNumber(bucketFieldValues, bucketObjectInspectors, numBuckets);
      }
-
-     // similar to hive's default partitioner, refer DefaultHivePartitioner
-     return (buckNum & Integer.MAX_VALUE) % numBuckets;
    }

    private void populateCachedDistributionKeys(Object row, int index) throws HiveException {
@@ -476,11 +473,11 @@ public class ReduceSinkOperator extends TerminalOperator<ReduceSinkDesc>
          keyHashCode = 1;
        }
      } else {
-       for (int i = 0; i < partitionEval.length; i++) {
-         Object o = partitionEval[i].evaluate(row);
-         keyHashCode = keyHashCode * 31
-             + ObjectInspectorUtils.hashCode(o, partitionObjectInspectors[i]);
+       Object[] bucketFieldValues = new Object[partitionEval.length];
+       for(int i = 0; i < partitionEval.length; i++) {
+         bucketFieldValues[i] = partitionEval[i].evaluate(row);
        }
+       keyHashCode = ObjectInspectorUtils.getBucketHashCode(bucketFieldValues, partitionObjectInspectors);
      }
      int hashCode = buckNum < 0 ? keyHashCode : keyHashCode * 31 + buckNum;
      if (isLogTraceEnabled) {
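
computeBucketNumber now has two exits: ACID UPDATE/DELETE rows already carry their bucket in ROW__ID, so it is read back directly; all other rows go through the new ObjectInspectorUtils helpers. A simplified restatement of that contract as a static method fragment, with primitive parameters standing in for the inspectors and expression evaluators (hypothetical names, not the actual operator code):

  // acidUpdateOrDelete: conf.getWriteType() is UPDATE or DELETE
  // bucketFromRowId:    the bucket field extracted from ROW__ID
  // fieldHashes:        per-column hashes of the bucketing expressions
  static int computeBucketNumberSketch(boolean acidUpdateOrDelete, int bucketFromRowId,
                                       int[] fieldHashes, int numBuckets) {
    if (acidUpdateOrDelete) {
      return bucketFromRowId;                       // no hashing needed
    }
    int hash = 0;
    for (int fieldHash : fieldHashes) {
      hash = hash * 31 + fieldHash;                 // ObjectInspectorUtils.getBucketHashCode
    }
    return (hash & Integer.MAX_VALUE) % numBuckets; // ObjectInspectorUtils.getBucketNumber
  }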

http://git-wip-us.apache.org/repos/asf/hive/blob/d4c0fa46/ql/src/java/org/apache/hadoop/hive/ql/io/DefaultHivePartitioner.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/DefaultHivePartitioner.java b/ql/src/java/org/apache/hadoop/hive/ql/io/DefaultHivePartitioner.java
index 6a91cb8..6a14fb8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/DefaultHivePartitioner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/DefaultHivePartitioner.java
@@ -18,6 +18,7 @@

  package org.apache.hadoop.hive.ql.io;

+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
  import org.apache.hadoop.mapred.lib.HashPartitioner;

  /** Partition keys by their {@link Object#hashCode()}. */
@@ -26,7 +27,7 @@ public class DefaultHivePartitioner<K2, V2> extends HashPartitioner<K2, V2> impl
    /** Use {@link Object#hashCode()} to partition. */
    @Override
    public int getBucket(K2 key, V2 value, int numBuckets) {
-     return (key.hashCode() & Integer.MAX_VALUE) % numBuckets;
+     return ObjectInspectorUtils.getBucketNumber(key.hashCode(), numBuckets);
    }

  }
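
getBucket now delegates to the shared ObjectInspectorUtils.getBucketNumber(int, int), so the "(hashCode & Integer.MAX_VALUE) % numBuckets" rule lives in one place. The mask matters because Java's % keeps the sign of the dividend, so a negative hash would otherwise produce a negative bucket index. A tiny check (hypothetical demo class, values chosen arbitrarily):

  public class BucketMaskDemo {
    public static void main(String[] args) {
      int hash = -7, buckets = 16;
      System.out.println(hash % buckets);                        // -7: not a usable bucket id
      System.out.println((hash & Integer.MAX_VALUE) % buckets);  // 9: sign bit cleared first
    }
  }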

http://git-wip-us.apache.org/repos/asf/hive/blob/d4c0fa46/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFHash.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFHash.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFHash.java
index 474f404..fd1fe92 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFHash.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFHash.java
@@ -18,7 +18,6 @@

  package org.apache.hadoop.hive.ql.udf.generic;

-import org.apache.commons.lang.StringUtils;
  import org.apache.hadoop.hive.ql.exec.Description;
  import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
  import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -45,13 +44,11 @@ public class GenericUDFHash extends GenericUDF {

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
-     // See
-     // http://java.sun.com/j2se/1.5.0/docs/api/java/util/List.html#hashCode()
-     int r = 0;
-     for (int i = 0; i < arguments.length; i++) {
-       r = r * 31
-           + ObjectInspectorUtils.hashCode(arguments[i].get(), argumentOIs[i]);
+     Object[] fieldValues = new Object[arguments.length];
+     for(int i = 0; i < arguments.length; i++) {
+       fieldValues[i] = arguments[i].get();
      }
+     int r = ObjectInspectorUtils.getBucketHashCode(fieldValues, argumentOIs);
      result.set(r);
      return result;
    }

http://git-wip-us.apache.org/repos/asf/hive/blob/d4c0fa46/ql/src/test/org/apache/hadoop/hive/ql/lockmgr/TestDbTxnManager.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/lockmgr/TestDbTxnManager.java b/ql/src/test/org/apache/hadoop/hive/ql/lockmgr/TestDbTxnManager.java
index 8a53ec5..ff38ff3 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/lockmgr/TestDbTxnManager.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/lockmgr/TestDbTxnManager.java
@@ -20,13 +20,16 @@ package org.apache.hadoop.hive.ql.lockmgr;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.hive.conf.HiveConf;
  import org.apache.hadoop.hive.metastore.api.FieldSchema;
-import org.apache.hadoop.hive.metastore.api.ShowLocksResponse;import org.apache.hadoop.hive.metastore.api.ShowLocksResponseElement;import org.apache.hadoop.hive.metastore.txn.TxnDbUtil;
+import org.apache.hadoop.hive.metastore.api.ShowLocksResponse;
+import org.apache.hadoop.hive.metastore.api.ShowLocksResponseElement;
+import org.apache.hadoop.hive.metastore.txn.TxnDbUtil;
  import org.apache.hadoop.hive.ql.Context;
  import org.apache.hadoop.hive.ql.ErrorMsg;
  import org.apache.hadoop.hive.ql.QueryPlan;
  import org.apache.hadoop.hive.ql.hooks.ReadEntity;
  import org.apache.hadoop.hive.ql.hooks.WriteEntity;
-import org.apache.hadoop.hive.ql.metadata.DummyPartition;import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.DummyPartition;
+import org.apache.hadoop.hive.ql.metadata.Partition;
  import org.apache.hadoop.hive.ql.metadata.Table;
  import org.apache.hadoop.hive.ql.session.SessionState;
  import org.apache.hadoop.hive.ql.txn.AcidHouseKeeperService;

http://git-wip-us.apache.org/repos/asf/hive/blob/d4c0fa46/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
index 902ff35..288ad8d 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
@@ -502,18 +502,23 @@ public final class ObjectInspectorUtils {
     * @return the bucket number
     */
    public static int getBucketNumber(Object[] bucketFields, ObjectInspector[] bucketFieldInspectors, int totalBuckets) {
-     int hashCode = getBucketHashCode(bucketFields, bucketFieldInspectors);
-     int bucketID = (hashCode & Integer.MAX_VALUE) % totalBuckets;
-     return bucketID;
+     return getBucketNumber(getBucketHashCode(bucketFields, bucketFieldInspectors), totalBuckets);
    }

    /**
+    * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL+BucketedTables
+    * @param hashCode as produced by {@link #getBucketHashCode(Object[], ObjectInspector[])}
+    */
+   public static int getBucketNumber(int hashCode, int numberOfBuckets) {
+     return (hashCode & Integer.MAX_VALUE) % numberOfBuckets;
+   }
+   /**
     * Computes the hash code for the given bucketed fields
     * @param bucketFields
     * @param bucketFieldInspectors
     * @return
     */
-   private static int getBucketHashCode(Object[] bucketFields, ObjectInspector[] bucketFieldInspectors) {
+   public static int getBucketHashCode(Object[] bucketFields, ObjectInspector[] bucketFieldInspectors) {
      int hashCode = 0;
      for (int i = 0; i < bucketFields.length; i++) {
        int fieldHash = ObjectInspectorUtils.hashCode(bucketFields[i], bucketFieldInspectors[i]);
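
With getBucketHashCode now public and the int overload of getBucketNumber added, callers can either compute the bucket in one call or reuse an already-computed hash. A small standalone usage sketch (the class name is hypothetical; the field values, inspectors, and 16 buckets mirror the test added below):

  import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
  import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
  import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

  public class BucketIdExample {
    public static void main(String[] args) {
      Object[] fields = { 1, "two", true };
      ObjectInspector[] inspectors = {
          PrimitiveObjectInspectorFactory.javaIntObjectInspector,
          PrimitiveObjectInspectorFactory.javaStringObjectInspector,
          PrimitiveObjectInspectorFactory.javaBooleanObjectInspector };
      // Two-step: hash the fields, then map the hash to a bucket.
      int hash = ObjectInspectorUtils.getBucketHashCode(fields, inspectors);
      int bucket = ObjectInspectorUtils.getBucketNumber(hash, 16);
      // One-step convenience overload; should agree with the two-step result.
      System.out.println(bucket == ObjectInspectorUtils.getBucketNumber(fields, inspectors, 16));
    }
  }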

http://git-wip-us.apache.org/repos/asf/hive/blob/d4c0fa46/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestObjectInspectorUtils.java
----------------------------------------------------------------------
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestObjectInspectorUtils.java b/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestObjectInspectorUtils.java
index f3fd6fa..6463363 100644
--- a/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestObjectInspectorUtils.java
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestObjectInspectorUtils.java
@@ -109,4 +109,29 @@ public class TestObjectInspectorUtils extends TestCase {
      }

    }
+  public void testBucketIdGeneration() {
+    ArrayList<String> fieldNames = new ArrayList<String>();
+    fieldNames.add("firstInteger");
+    fieldNames.add("secondString");
+    fieldNames.add("thirdBoolean");
+    ArrayList<ObjectInspector> fieldObjectInspectors = new ArrayList<ObjectInspector>();
+    fieldObjectInspectors
+        .add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
+    fieldObjectInspectors
+        .add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+    fieldObjectInspectors
+        .add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+
+    StandardStructObjectInspector soi1 = ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldObjectInspectors);
+    ArrayList<Object> struct = new ArrayList<Object>(3);
+    struct.add(1);
+    struct.add("two");
+    struct.add(true);
+
+    int hashCode = ObjectInspectorUtils.getBucketHashCode(struct.toArray(), fieldObjectInspectors.toArray(new ObjectInspector[fieldObjectInspectors.size()]));
+    assertEquals("", 3574518, hashCode);
+    int bucketId = ObjectInspectorUtils.getBucketNumber(struct.toArray(), fieldObjectInspectors.toArray(new ObjectInspector[fieldObjectInspectors.size()]), 16);
+    assertEquals("", 6, bucketId);
+    assertEquals("", bucketId, ObjectInspectorUtils.getBucketNumber(hashCode, 16));
+  }
  }
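
A quick sanity check on the expected values above: 3574518 is positive, so the Integer.MAX_VALUE mask leaves it unchanged, and 3574518 = 223407 * 16 + 6, which is why both bucket-number assertions expect 6 for 16 buckets.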
