Added: hive/branches/hbase-metastore/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/utils/BloomFilter.java
URL: http://svn.apache.org/viewvc/hive/branches/hbase-metastore/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/utils/BloomFilter.java?rev=1670966&view=auto
==============================================================================
--- hive/branches/hbase-metastore/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/utils/BloomFilter.java (added)
+++ hive/branches/hbase-metastore/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/utils/BloomFilter.java Thu Apr 2 20:59:42 2015
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.metastore.hbase.utils;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.util.Hash;
+
+/**
+ * Barebones bloom filter implementation
+ */
+public class BloomFilter {
+ private static final Log LOG = LogFactory.getLog(BloomFilter.class.getName());
+ // The cardinality of the set for which we are generating the bloom filter
+ // Default size is 10000
+ private int setSize = 10000;
+ // The probability of false positives we are ready to tolerate
+ // Default is 1%
+ private double falsePositiveProbability = 0.01;
+ // Number of bits used for the filter
+ // Formula: -n*ln(p) / (ln(2)^2)
+ private int numBits;
+ // Number of hashing functions
+ // Formula: m/n * ln(2)
+ private int numHashFunctions;
+ private final Hash hash;
+ private BitVector bitVector;
+
+ public BloomFilter(int setSize, double falsePositiveProbability) {
+ this.setSize = setSize;
+ this.falsePositiveProbability = falsePositiveProbability;
+ this.numBits = calculateFilterSize(this.setSize, this.falsePositiveProbability);
+ this.numHashFunctions = calculateHashFunctions(this.setSize, this.numBits);
+ // Create a bit vector of size numBits
+ this.bitVector = new BitVector(numBits);
+ // Use MurmurHash3
+ hash = Hash.getInstance(Hash.MURMUR_HASH3);
+ }
+
+ /**
+ * Calculate the number of bits in the filter
+ * Also aligns the size up to a multiple of BitVector.ELEMENT_SIZE
+ * @param setSize
+ * @param falsePositiveProbability
+ * @return numBits
+ */
+ private int calculateFilterSize(int setSize, double falsePositiveProbability) {
+ int numBits = (int) (-setSize * Math.log(falsePositiveProbability) / (Math.log(2) * Math.log(2)));
+ // Round up to the next multiple of ELEMENT_SIZE (no-op when already aligned)
+ numBits += (BitVector.ELEMENT_SIZE - (numBits % BitVector.ELEMENT_SIZE)) % BitVector.ELEMENT_SIZE;
+ LOG.info("Bloom Filter size: " + numBits);
+ return numBits;
+ }
+
+ /**
+ * Calculate the number of hash functions needed by the BloomFilter
+ * @param setSize
+ * @param numBits
+ * @return numHashFunctions
+ */
+ private int calculateHashFunctions(int setSize, int numBits) {
+ int numHashFunctions = Math.max(1, (int) Math.round((double) numBits / setSize * Math.log(2)));
+ LOG.info("Number of hashing functions: " + numHashFunctions);
+ return numHashFunctions;
+ }
+
+ /**
+ * @return the underlying BitVector object
+ */
+ public BitVector getBitVector() {
+ return bitVector;
+ }
+
+ public int getFilterSize() {
+ return numBits;
+ }
+
+ public int getNumHashFunctions() {
+ return numHashFunctions;
+ }
+
+ /**
+ * Add an item to the filter
+ *
+ * @param item to add
+ */
+ public void addToFilter(byte[] item) {
+ int bitIndex;
+ // Hash the item numHashFunctions times
+ for (int i = 0; i < numHashFunctions; i++) {
+ bitIndex = getBitIndex(item, i);
+ // Set the bit at this index
+ bitVector.setBit(bitIndex);
+ }
+ }
+
+ /**
+ * Check whether the item may be present in the filter
+ * @param item
+ * @return true if the item may be present (false positives are possible),
+ * false if it is definitely absent
+ */
+ public boolean contains(byte[] item) {
+ // Hash the item numHashFunctions times; if any corresponding bit is
+ // clear, the item is definitely not in the filter
+ for (int i = 0; i < numHashFunctions; i++) {
+ if (!bitVector.isBitSet(getBitIndex(item, i))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Hash the item using i as the seed and return its position in the bit vector.
+ * Math.abs folds a negative hash value back into the valid index range.
+ *
+ * @param item
+ * @param i (for the i-th hash function)
+ * @return position of item in the underlying bit vector
+ */
+ private int getBitIndex(byte[] item, int i) {
+ return Math.abs(hash.hash(item, i) % (numBits));
+ }
+}

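For the parameters TestBloomFilter uses below (setSize = 50, falsePositiveProbability = 0.01), the two formulas above give m = -50 * ln(0.01) / (ln 2)^2 ≈ 479.25, truncated to 479 and aligned up to 480 bits (assuming BitVector.ELEMENT_SIZE is 8, which is consistent with the pre-calculated FILTER_SIZE of 480 in that test), and k = round(480/50 * ln 2) = 7 hash functions, matching NUM_HASH_FUNCTIONS. A minimal usage sketch (hypothetical driver code, not part of this commit):

    BloomFilter filter = new BloomFilter(50, 0.01);
    filter.addToFilter("Part1=Val1".getBytes());
    // contains() can return false positives, never false negatives
    boolean maybePresent = filter.contains("Part1=Val1".getBytes());   // true
    boolean probablyAbsent = filter.contains("PartX=ValX".getBytes()); // almost certainly false
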
Added: hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestAggregateStatsCache.java
URL: http://svn.apache.org/viewvc/hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestAggregateStatsCache.java?rev=1670966&view=auto
==============================================================================
--- hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestAggregateStatsCache.java (added)
+++ hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestAggregateStatsCache.java Thu Apr 2 20:59:42 2015
@@ -0,0 +1,195 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.metastore.hbase;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
+import org.apache.hadoop.hive.metastore.hbase.AggregateStatsCache.AggrColStatsCached;
+import org.apache.hadoop.hive.metastore.hbase.utils.BloomFilter;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestAggregateStatsCache {
+ static int MAX_CACHE_NODES = 10;
+ static int MAX_PARTITIONS_PER_CACHE_NODE = 10;
+ static String TIME_TO_LIVE = "20s";
+ static float FALSE_POSITIVE_PROBABILITY = (float) 0.01;
+ static float MAX_VARIANCE = (float) 0.1;
+ static AggregateStatsCache cache;
+ static String dbName = "db";
+ static String tablePrefix = "tab";
+ static String partitionPrefix = "part";
+ static String columnPrefix = "col";
+ static int numTables = 2;
+ static int numPartitions = 20;
+ static int numColumns = 5;
+ static List<String> tables = new ArrayList<String>();
+ static List<String> tabParts = new ArrayList<String>();
+ static List<String> tabCols = new ArrayList<String>();
+
+ @BeforeClass
+ public static void beforeTest() {
+ // All data initializations
+ initializeTables();
+ initializePartitions();
+ initializeColumns();
+ }
+
+ private static void initializeTables() {
+ for (int i = 1; i <= numTables; i++) {
+ // tab1, tab2
+ tables.add(tablePrefix + i);
+ }
+ }
+
+ private static void initializePartitions() {
+ for (int i = 1; i <= numPartitions; i++) {
+ // part1 ... part20
+ tabParts.add(partitionPrefix + i);
+ }
+ }
+
+ private static void initializeColumns() {
+ for (int i = 1; i <= numColumns; i++) {
+ // col1 ... col5
+ tabCols.add(columnPrefix + i);
+ }
+ }
+
+ @AfterClass
+ public static void afterTest() {
+ }
+
+ @Before
+ public void setUp() {
+ HiveConf hiveConf = new HiveConf();
+ hiveConf.setIntVar(HiveConf.ConfVars.METASTORE_HBASE_AGGREGATE_STATS_CACHE_SIZE,
+ MAX_CACHE_NODES);
+ hiveConf.setIntVar(HiveConf.ConfVars.METASTORE_HBASE_AGGREGATE_STATS_CACHE_MAX_PARTITIONS,
+ MAX_PARTITIONS_PER_CACHE_NODE);
+ hiveConf.setFloatVar(
+ HiveConf.ConfVars.METASTORE_HBASE_AGGREGATE_STATS_CACHE_FALSE_POSITIVE_PROBABILITY,
+ FALSE_POSITIVE_PROBABILITY);
+ hiveConf.setFloatVar(HiveConf.ConfVars.METASTORE_HBASE_AGGREGATE_STATS_CACHE_MAX_VARIANCE,
+ MAX_VARIANCE);
+ hiveConf.setVar(HiveConf.ConfVars.METASTORE_HBASE_CACHE_TIME_TO_LIVE, TIME_TO_LIVE);
+ cache = AggregateStatsCache.getInstance(hiveConf);
+ }
+
+ @After
+ public void tearDown() {
+ }
+
+ @Test
+ public void testBasicAddAndGet() {
+ // Add a dummy aggregate stats object for 9 partitions of tab1 for col1
+ int minPart = 1;
+ int maxPart = 9;
+ String tblName = tables.get(0);
+ String colName = tabCols.get(0);
+ ColumnStatisticsObj aggrColStats = getDummyLongColStat(colName);
+ // Prepare the bloom filter
+ BloomFilter bloomFilter =
+ new BloomFilter(MAX_PARTITIONS_PER_CACHE_NODE, FALSE_POSITIVE_PROBABILITY);
+ List<String> partNames = new ArrayList<String>();
+ for (int i = minPart; i <= maxPart; i++) {
+ String partName = tabParts.get(i);
+ partNames.add(partName);
+ bloomFilter.addToFilter(partName.getBytes());
+ }
+ // Now add to cache
+ cache.add(dbName, tblName, colName, maxPart - minPart + 1, aggrColStats, bloomFilter);
+ // Now get from cache
+ AggrColStatsCached aggrStatsCached = cache.get(dbName, tblName, colName, partNames);
+ Assert.assertNotNull(aggrStatsCached);
+
+ ColumnStatisticsObj aggrColStatsCached = aggrStatsCached.getColStats();
+ Assert.assertEquals(aggrColStats, aggrColStatsCached);
+
+ // Now get a non-existent entry
+ aggrStatsCached = cache.get("dbNotThere", tblName, colName, partNames);
+ Assert.assertNull(aggrStatsCached);
+ }
+
+ @Test
+ public void testAddGetWithVariance() {
+ // Add a dummy aggregate stats object for 9 partitions of tab1 for col1
+ int minPart = 1;
+ int maxPart = 9;
+ String tblName = tables.get(0);
+ String colName = tabCols.get(0);
+ ColumnStatisticsObj aggrColStats = getDummyLongColStat(colName);
+ // Prepare the bloom filter
+ BloomFilter bloomFilter =
+ new BloomFilter(MAX_PARTITIONS_PER_CACHE_NODE, FALSE_POSITIVE_PROBABILITY);
+ // The partitions we'll eventually request from the cache
+ List<String> partNames = new ArrayList<String>();
+ for (int i = minPart; i <= maxPart; i++) {
+ String partName = tabParts.get(i);
+ // Only add some of the partitions (i = 1..3 here) to partNames, so the
+ // request differs from the cached node by more than the allowed variance and should fail
+ if (i < maxPart / 2) {
+ partNames.add(partName);
+ }
+ bloomFilter.addToFilter(partName.getBytes());
+ }
+ // Now add to cache
+ cache.add(dbName, tblName, colName, maxPart - minPart + 1, aggrColStats, bloomFilter);
+ // Now get from cache
+ AggrColStatsCached aggrStatsCached = cache.get(dbName, tblName, colName, partNames);
+ Assert.assertNull(aggrStatsCached);
+ }
+
+ @Test
+ public void testMultiThreaded() {
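+ // Placeholder: no multi-threaded scenario is exercised yet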
+ }
+
+ private ColumnStatisticsObj getDummyLongColStat(String colName) {
+ ColumnStatisticsObj aggrColStats = new ColumnStatisticsObj();
+ aggrColStats.setColName(colName);
+ aggrColStats.setColType("long");
+ LongColumnStatsData longStatsData = new LongColumnStatsData();
+ // Set some random values
+ int highVal = 100;
+ int lowVal = 10;
+ int numDVs = 50;
+ int numNulls = 5;
+ longStatsData.setHighValue(highVal);
+ longStatsData.setLowValue(lowVal);
+ longStatsData.setNumDVs(numDVs);
+ longStatsData.setNumNulls(numNulls);
+ ColumnStatisticsData aggrColStatsData = new ColumnStatisticsData();
+ aggrColStatsData.setLongStats(longStatsData);
+ aggrColStats.setStatsData(aggrColStatsData);
+ return aggrColStats;
+ }
+}

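The tests above exercise AggregateStatsCache only through its public add/get methods; the matching logic itself lives in AggregateStatsCache, which is not shown in this mail. A plausible sketch of how a cached node could use its bloom filter to cheaply reject a request, inferred from the usage above and offered purely as an illustration (mayCover is a hypothetical helper, not an API in this commit):

    // Hypothetical: a node cannot serve the request if any requested
    // partition is definitely absent from the partitions it was built on
    boolean mayCover(BloomFilter bloomFilter, List<String> partNames) {
      for (String partName : partNames) {
        if (!bloomFilter.contains(partName.getBytes())) {
          return false; // definite miss: bloom filters have no false negatives
        }
      }
      return true; // possible match; false positives remain possible
    }
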
Added: hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/utils/TestBitVector.java
URL: http://svn.apache.org/viewvc/hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/utils/TestBitVector.java?rev=1670966&view=auto
==============================================================================
--- hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/utils/TestBitVector.java (added)
+++ hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/utils/TestBitVector.java Thu Apr 2 20:59:42 2015
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.metastore.hbase.utils;
+
+import org.apache.hadoop.hive.metastore.hbase.utils.BitVector;
+import org.junit.Assert;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestBitVector {
+
+ static int BIT_VECTOR_SIZE = 32;
+ BitVector bitVector;
+
+ @BeforeClass
+ public static void beforeTest() {
+ }
+
+ @AfterClass
+ public static void afterTest() {
+ }
+
+ @Before
+ public void setUp() {
+ // Create a new BitVector
+ bitVector = new BitVector(BIT_VECTOR_SIZE);
+ }
+
+
+ @After
+ public void tearDown() {
+ }
+
+ @Test
+ public void testSetAll() {
+ // Set bits
+ bitVector.setAll();
+ Assert.assertEquals("11111111111111111111111111111111", bitVector.toString());
+ }
+
+ @Test
+ public void testClearAll() {
+ // Clear all bits
+ bitVector.clearAll();
+ Assert.assertEquals("00000000000000000000000000000000", bitVector.toString());
+ }
+
+ @Test
+ public void testSetUnsetBit() {
+ // Set 3rd bit
+ bitVector.setBit(2);
+ Assert.assertEquals("00100000000000000000000000000000", bitVector.toString());
+ // Now check if 3rd bit is set
+ Assert.assertTrue(bitVector.isBitSet(2));
+ // Now set 30th bit
+ bitVector.setBit(29);
+ Assert.assertEquals("00100000000000000000000000000100", bitVector.toString());
+ // Now check if 30th bit is set
+ Assert.assertTrue(bitVector.isBitSet(29));
+
+ // Now unset 3rd bit
+ bitVector.unSetBit(2);
+ Assert.assertEquals("00000000000000000000000000000100", bitVector.toString());
+ // Now check if 3rd bit is unset
+ Assert.assertFalse(bitVector.isBitSet(2));
+ // Now unset 30th bit
+ bitVector.unSetBit(29);
+ Assert.assertEquals("00000000000000000000000000000000", bitVector.toString());
+ // Now check if 30th bit is unset
+ Assert.assertFalse(bitVector.isBitSet(29));
+ }
+}

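BitVector itself is added elsewhere in this revision; the assertions above pin down its contract: bit 0 is the leftmost character of toString(), sizes are multiples of ELEMENT_SIZE, and setBit/unSetBit/isBitSet operate on individual bit indexes. A minimal sketch consistent with those assertions (an illustration only, not the committed implementation; ELEMENT_SIZE = 8 is an assumption):

    public class BitVector {
      public static final int ELEMENT_SIZE = 8;
      private final byte[] bits;

      // numBits is expected to be a multiple of ELEMENT_SIZE
      public BitVector(int numBits) {
        bits = new byte[numBits / ELEMENT_SIZE];
      }

      // Bit 0 maps to the most significant bit of bits[0],
      // so toString() prints bit 0 first (leftmost)
      public void setBit(int i)   { bits[i / 8] |= (byte) (1 << (7 - (i % 8))); }
      public void unSetBit(int i) { bits[i / 8] &= (byte) ~(1 << (7 - (i % 8))); }
      public boolean isBitSet(int i) { return (bits[i / 8] & (1 << (7 - (i % 8)))) != 0; }
      public void setAll()   { java.util.Arrays.fill(bits, (byte) 0xFF); }
      public void clearAll() { java.util.Arrays.fill(bits, (byte) 0); }

      @Override
      public String toString() {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < bits.length * 8; i++) {
          sb.append(isBitSet(i) ? '1' : '0');
        }
        return sb.toString();
      }
    }
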
Added: hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/utils/TestBloomFilter.java
URL: http://svn.apache.org/viewvc/hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/utils/TestBloomFilter.java?rev=1670966&view=auto
==============================================================================
--- hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/utils/TestBloomFilter.java (added)
+++ hive/branches/hbase-metastore/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/utils/TestBloomFilter.java Thu Apr 2 20:59:42 2015
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.metastore.hbase.utils;
+
+import org.apache.hadoop.hive.metastore.hbase.utils.BloomFilter;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestBloomFilter {
+ static final int SET_SIZE = 50;
+ static final double FALSE_POSITIVE_PROBABILITY = 0.01;
+ // Pre-calculated for the above set size and fpp
+ static final int FILTER_SIZE = 480;
+ static final int NUM_HASH_FUNCTIONS = 7;
+ BloomFilter bloomFilter;
+ // Items that we'll add to the filter
+ String[] items = {"Part1=Val1", "Part2=Val2", "Part3=Val3", "Part4=Val4", "Part5=Val5"};
+
+ @BeforeClass
+ public static void beforeTest() {
+ }
+
+ @AfterClass
+ public static void afterTest() {
+ }
+
+ @Before
+ public void setUp() {
+ bloomFilter = new BloomFilter(SET_SIZE, FALSE_POSITIVE_PROBABILITY);
+ }
+
+ @After
+ public void tearDown() {
+ }
+
+ @Test
+ public void testFilterAndHashSize() {
+ Assert.assertEquals(bloomFilter.getFilterSize(), FILTER_SIZE);
+ Assert.assertEquals(bloomFilter.getNumHashFunctions(), NUM_HASH_FUNCTIONS);
+ }
+
+ @Test
+ public void testFilterFunctions() {
+ // Add all items to the bloom filter
+ // (since bloom filter returns false positives, no point testing for negative cases)
+ for (String item: items) {
+ bloomFilter.addToFilter(item.getBytes());
+ }
+ // Test for presence
+ for (String item: items) {
+ Assert.assertTrue(bloomFilter.contains(item.getBytes()));
+ }
+ // Clear all bits
+ bloomFilter.getBitVector().clearAll();
+ // Test for presence now - should fail
+ for (String item: items) {
+ Assert.assertFalse(bloomFilter.contains(item.getBytes()));
+ }
+ }
+
+}

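As a sanity check on testFilterFunctions: with only five items in a 480-bit filter using 7 hash functions, the standard approximation for the false-positive rate gives (1 - e^(-k*n/m))^k = (1 - e^(-7*5/480))^7 ≈ 8.5e-9, so the positive and negative assertions in that test are deterministic for all practical purposes.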