Repository: hive
Updated Branches:
   refs/heads/master 157d82515 -> 7e3605dd9


HIVE-13040 : Handle empty bucket creations more efficiently (Ashutosh Chauhan via Prasanth J)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/7e3605dd
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/7e3605dd
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/7e3605dd

Branch: refs/heads/master
Commit: 7e3605dd9056efa4fa8fe36d74a064e6f59d1e83
Parents: 157d825
Author: Ashutosh Chauhan <hashutosh@apache.org>
Authored: Wed Feb 10 15:51:04 2016 -0800
Committer: Ashutosh Chauhan <hashutosh@apache.org>
Committed: Sun Feb 28 00:42:56 2016 -0800

----------------------------------------------------------------------
  .../hadoop/hive/ql/exec/StatsNoJobTask.java | 31 +++++++------
  .../apache/hadoop/hive/ql/exec/Utilities.java | 4 +-
  .../org/apache/hadoop/hive/ql/io/AcidUtils.java | 16 ++++---
  .../hadoop/hive/ql/io/orc/OrcInputFormat.java | 14 +++---
  .../hadoop/hive/ql/io/orc/OrcOutputFormat.java | 17 +++----
  .../hive/ql/txn/compactor/CompactorMR.java | 2 +-
  .../hadoop/hive/ql/txn/compactor/Initiator.java | 2 +-
  .../hive/ql/io/orc/TestInputOutputFormat.java | 31 +++++++------
  .../dynpart_sort_opt_vectorization.q.out | 4 +-
  .../spark/vector_outer_join1.q.out | 40 ++++++++--------
  .../spark/vector_outer_join4.q.out | 40 ++++++++--------
  .../tez/dynpart_sort_opt_vectorization.q.out | 8 ++--
  .../tez/dynpart_sort_optimization.q.out | 4 +-
  .../clientpositive/tez/union_fast_stats.q.out | 10 ++--
  .../clientpositive/tez/vector_outer_join1.q.out | 48 ++++++++++----------
  .../clientpositive/tez/vector_outer_join4.q.out | 48 ++++++++++----------
  .../clientpositive/union_fast_stats.q.out | 16 +++----
  .../apache/hadoop/hive/shims/Hadoop23Shims.java | 2 +-
  18 files changed, 172 insertions(+), 165 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java
index 2f0a167..175dbdb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java
@@ -126,7 +126,7 @@ public class StatsNoJobTask extends Task<StatsNoJobWork> implements Serializable

    class StatsCollection implements Runnable {

-     private Partition partn;
+     private final Partition partn;

      public StatsCollection(Partition part) {
        this.partn = part;
@@ -151,7 +151,7 @@ public class StatsNoJobTask extends Task<StatsNoJobWork> implements Serializable
          boolean statsAvailable = false;
          for(FileStatus file: fileList) {
            if (!file.isDir()) {
-             InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtil.newInstance(
+             InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(
                  partn.getInputFormatClass(), jc);
              InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0,
                  new String[] { partn.getLocation() });
@@ -195,7 +195,7 @@ public class StatsNoJobTask extends Task<StatsNoJobWork> implements Serializable
              "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));

          // Before updating the partition params, if any partition params is null
-         // and if statsReliable is true then updatePartition() function  will fail
+         // and if statsReliable is true then updatePartition() function will fail
          // the task by returning 1
          if (work.isStatsReliable()) {
            partUpdates.put(tPart.getSd().getLocation(), null);
@@ -246,22 +246,27 @@ public class StatsNoJobTask extends Task<StatsNoJobWork> implements Serializable
            boolean statsAvailable = false;
            for(FileStatus file: fileList) {
              if (!file.isDir()) {
-               InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtil.newInstance(
+               InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(
                    table.getInputFormatClass(), jc);
                InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table
                    .getDataLocation().toString() });
-               org.apache.hadoop.mapred.RecordReader<?, ?> recordReader =
-                   inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
-               StatsProvidingRecordReader statsRR;
-               if (recordReader instanceof StatsProvidingRecordReader) {
-                 statsRR = (StatsProvidingRecordReader) recordReader;
-                 numRows += statsRR.getStats().getRowCount();
-                 rawDataSize += statsRR.getStats().getRawDataSize();
-                 fileSize += file.getLen();
+               if (file.getLen() == 0) {
                  numFiles += 1;
                  statsAvailable = true;
+               } else {
+                 org.apache.hadoop.mapred.RecordReader<?, ?> recordReader =
+                     inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
+                 StatsProvidingRecordReader statsRR;
+                 if (recordReader instanceof StatsProvidingRecordReader) {
+                   statsRR = (StatsProvidingRecordReader) recordReader;
+                   numRows += statsRR.getStats().getRowCount();
+                   rawDataSize += statsRR.getStats().getRawDataSize();
+                   fileSize += file.getLen();
+                   numFiles += 1;
+                   statsAvailable = true;
+                 }
+                 recordReader.close();
                }
-               recordReader.close();
              }
            }
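
The restructured loop above is the heart of the patch: a zero-length file still counts toward numFiles, but Hive no longer instantiates an input format and record reader for it. The same pattern in isolation, as a minimal sketch using only the plain Hadoop FileSystem API (the class and method names here are illustrative, not part of the patch):

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    // Illustrative sketch: count every data file, but only take the expensive
    // reader path for files that actually contain bytes.
    public class EmptyFileSkipSketch {
      public static long[] quickStats(Configuration conf, Path dir) throws IOException {
        FileSystem fs = dir.getFileSystem(conf);
        long numFiles = 0, fileSize = 0;
        for (FileStatus file : fs.listStatus(dir)) {
          if (file.isDirectory()) {
            continue;
          }
          numFiles++;                    // empty buckets still show up in numFiles
          if (file.getLen() == 0) {
            continue;                    // ...but no record reader is created for them
          }
          fileSize += file.getLen();     // only non-empty files reach the reader path
        }
        return new long[] { numFiles, fileSize };
      }
    }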


http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index 7a62ff9..ab0635e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -1480,7 +1480,7 @@ public final class Utilities {

          taskIDToFile = removeTempOrDuplicateFiles(items, fs);
          // if the table is bucketed and enforce bucketing, we should check and generate all buckets
-         if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null) {
+         if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) {
            // refresh the file list
            items = fs.listStatus(parts[i].getPath());
            // get the missing buckets and generate empty buckets
@@ -1500,7 +1500,7 @@ public final class Utilities {
        FileStatus[] items = fs.listStatus(path);
        taskIDToFile = removeTempOrDuplicateFiles(items, fs);
        if(taskIDToFile != null && taskIDToFile.size() > 0 && conf != null && conf.getTable() != null
-           && (conf.getTable().getNumBuckets() > taskIDToFile.size())) {
+           && (conf.getTable().getNumBuckets() > taskIDToFile.size()) && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) {
            // get the missing buckets and generate empty buckets for non-dynamic partition
          String taskID1 = taskIDToFile.keySet().iterator().next();
          Path bucketPath = taskIDToFile.values().iterator().next().getPath();
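
Both hunks in Utilities add the same guard: when the execution engine is Tez, Hive no longer materializes empty bucket files to pad the bucket count. The guard in isolation, as a small sketch (the helper name and the "mr" default are assumptions for illustration):

    import org.apache.hadoop.conf.Configuration;

    // Illustrative guard: only engines that still expect one file per bucket
    // (i.e., not Tez) get empty bucket files generated for them.
    public final class EmptyBucketGuard {
      private EmptyBucketGuard() {}

      public static boolean shouldGenerateEmptyBuckets(Configuration conf,
                                                       int numBuckets, int filesPresent) {
        String engine = conf.get("hive.execution.engine", "mr");
        return numBuckets > filesPresent && !"tez".equalsIgnoreCase(engine);
      }
    }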

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 520ae74..9bf9377 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -106,6 +106,7 @@ public class AcidUtils {
        Pattern.compile("[0-9]+_[0-9]+");

    public static final PathFilter hiddenFileFilter = new PathFilter(){
+     @Override
      public boolean accept(Path p){
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
@@ -446,7 +447,7 @@ public class AcidUtils {
        Configuration conf,
        ValidTxnList txnList
        ) throws IOException {
-     return getAcidState(directory, conf, txnList, false);
+     return getAcidState(directory, conf, txnList, false, false);
    }

    /** State class for getChildState; cannot modify 2 things in a method. */
@@ -469,7 +470,8 @@ public class AcidUtils {
    public static Directory getAcidState(Path directory,
                                         Configuration conf,
                                         ValidTxnList txnList,
-                                        boolean useFileIds
+                                        boolean useFileIds,
+                                        boolean ignoreEmptyFiles
                                         ) throws IOException {
      FileSystem fs = directory.getFileSystem(conf);
      final List<ParsedDelta> deltas = new ArrayList<ParsedDelta>();
@@ -490,13 +492,13 @@ public class AcidUtils {
      if (childrenWithId != null) {
        for (HdfsFileStatusWithId child : childrenWithId) {
          getChildState(child.getFileStatus(), child, txnList, working,
-           originalDirectories, original, obsolete, bestBase);
+           originalDirectories, original, obsolete, bestBase, ignoreEmptyFiles);
        }
      } else {
        List<FileStatus> children = SHIMS.listLocatedStatus(fs, directory, hiddenFileFilter);
        for (FileStatus child : children) {
          getChildState(
-           child, null, txnList, working, originalDirectories, original, obsolete, bestBase);
+           child, null, txnList, working, originalDirectories, original, obsolete, bestBase, ignoreEmptyFiles);
        }
      }

@@ -577,7 +579,7 @@ public class AcidUtils {

    private static void getChildState(FileStatus child, HdfsFileStatusWithId childWithId,
        ValidTxnList txnList, List<ParsedDelta> working, List<FileStatus> originalDirectories,
-       List<HdfsFileStatusWithId> original, List<FileStatus> obsolete, TxnBase bestBase) {
+       List<HdfsFileStatusWithId> original, List<FileStatus> obsolete, TxnBase bestBase, boolean ignoreEmptyFiles) {
      Path p = child.getPath();
      String fn = p.getName();
      if (fn.startsWith(BASE_PREFIX) && child.isDir()) {
@@ -605,7 +607,7 @@ public class AcidUtils {
        // it is possible that the cleaner is running and removing these original files,
        // in which case recursing through them could cause us to get an error.
        originalDirectories.add(child);
-   } else {
+   } else if (!ignoreEmptyFiles || child.getLen() != 0){
        original.add(createOriginalObj(childWithId, child));
      }
    }
@@ -616,7 +618,7 @@ public class AcidUtils {
    }

    private static class HdfsFileStatusWithoutId implements HdfsFileStatusWithId {
-     private FileStatus fs;
+     private final FileStatus fs;

      public HdfsFileStatusWithoutId(FileStatus fs) {
        this.fs = fs;
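
getAcidState() gains an ignoreEmptyFiles flag that is threaded through to getChildState(), so callers can ask for zero-length "original" files to be dropped while directories are still recursed into. The decisive condition as a standalone sketch (the class name is illustrative):

    import org.apache.hadoop.fs.FileStatus;

    // Illustrative predicate mirroring the new getChildState() condition: keep a
    // file when the caller wants to see everything, or when it has content.
    public final class OriginalFileFilter {
      private OriginalFileFilter() {}

      public static boolean keepOriginal(FileStatus child, boolean ignoreEmptyFiles) {
        return !ignoreEmptyFiles || child.getLen() != 0;
      }
    }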

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index d175d2d..0ebcd2a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -897,11 +897,13 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
        List<OrcSplit> splits = Lists.newArrayList();
        for (HdfsFileStatusWithId file : fileStatuses) {
          FileStatus fileStatus = file.getFileStatus();
-         String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
-             .getHosts();
-         OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), file.getFileId(), 0,
-             fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1);
-         splits.add(orcSplit);
+         if (fileStatus.getLen() != 0) {
+           String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
+               .getHosts();
+           OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), file.getFileId(), 0,
+               fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1);
+           splits.add(orcSplit);
+         }
        }

        // add uncovered ACID delta splits
@@ -992,7 +994,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,

      private AcidDirInfo callInternal() throws IOException {
        AcidUtils.Directory dirInfo = AcidUtils.getAcidState(dir,
-           context.conf, context.transactionList, useFileIds);
+           context.conf, context.transactionList, useFileIds, true);
        Path base = dirInfo.getBaseDirectory();
        // find the base files (original or new style)
        List<HdfsFileStatusWithId> children = (base == null)
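
OrcInputFormat now defends on both ends: getAcidState() is asked to drop empty originals (the new trailing true), and split generation itself refuses to build an OrcSplit for a zero-length file. That filtering step as a sketch with a stand-in split type (SimpleSplit is illustrative; OrcSplit is the real class):

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;

    // Illustrative: zero-length files never become splits, matching the
    // fileStatus.getLen() != 0 guard added above.
    public class SplitFilterSketch {
      static final class SimpleSplit {
        final Path path;
        final long length;
        SimpleSplit(Path path, long length) { this.path = path; this.length = length; }
      }

      public static List<SimpleSplit> toSplits(List<FileStatus> files) {
        List<SimpleSplit> splits = new ArrayList<SimpleSplit>();
        for (FileStatus f : files) {
          if (f.getLen() != 0) {
            splits.add(new SimpleSplit(f.getPath(), f.getLen()));
          }
        }
        return splits;
      }
    }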

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
index 3fb6a86..b0f8c8b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
@@ -110,23 +110,20 @@ public class OrcOutputFormat extends FileOutputFormat<NullWritable, OrcSerdeRow>

      @Override
      public void close(boolean b) throws IOException {
-       // if we haven't written any rows, we need to create a file with a
-       // generic schema.
        if (writer == null) {
-         // a row with no columns
-         ObjectInspector inspector = ObjectInspectorFactory.
-             getStandardStructObjectInspector(new ArrayList<String>(),
-                 new ArrayList<ObjectInspector>());
-         options.inspector(inspector);
-         writer = OrcFile.createWriter(path, options);
+         // we are closing a file without writing any data in it
+         FileSystem fs = options.getFileSystem() == null ?
+             path.getFileSystem(options.getConfiguration()) : options.getFileSystem();
+         fs.createNewFile(path);
+         return;
        }
        writer.close();
      }

      @Override
      public SerDeStats getStats() {
-       stats.setRawDataSize(writer.getRawDataSize());
-       stats.setRowCount(writer.getNumberOfRows());
+       stats.setRawDataSize(null == writer ? 0 : writer.getRawDataSize());
+       stats.setRowCount(null == writer ? 0 : writer.getNumberOfRows());
        return stats;
      }
    }
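
With this change an ORC writer that never saw a row leaves behind a plain zero-length file (via FileSystem.createNewFile) instead of a schema-less ORC file, which is why getStats() must now tolerate writer == null. The null-safe accessor pattern in isolation (RowWriter is an illustrative stand-in for the real ORC Writer):

    // Illustrative null-safe stats: when no row was written there is no writer,
    // so both metrics degrade to zero rather than throwing an NPE.
    interface RowWriter {
      long getRawDataSize();
      long getNumberOfRows();
    }

    class NullSafeStats {
      private final RowWriter writer;   // null when close() ran before any write()

      NullSafeStats(RowWriter writer) { this.writer = writer; }

      long rawDataSize() { return writer == null ? 0 : writer.getRawDataSize(); }
      long rowCount()    { return writer == null ? 0 : writer.getNumberOfRows(); }
    }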

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
index fea0764..f1f1db2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
@@ -150,7 +150,7 @@ public class CompactorMR {
      // and discovering that in getSplits is too late as we then have no way to pass it to our
      // mapper.

-     AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns, false);
+     AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns, false, true);
      List<AcidUtils.ParsedDelta> parsedDeltas = dir.getCurrentDirectories();
      int maxDeltastoHandle = conf.getIntVar(HiveConf.ConfVars.COMPACTOR_MAX_NUM_DELTA);
      if(parsedDeltas.size() > maxDeltastoHandle) {

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java
index 3705a34..3e22548 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java
@@ -228,7 +228,7 @@ public class Initiator extends CompactorThread {
      boolean noBase = false;
      Path location = new Path(sd.getLocation());
      FileSystem fs = location.getFileSystem(conf);
-     AcidUtils.Directory dir = AcidUtils.getAcidState(location, conf, txns, false);
+     AcidUtils.Directory dir = AcidUtils.getAcidState(location, conf, txns, false, false);
      Path base = dir.getBaseDirectory();
      long baseSize = 0;
      FileStatus stat = null;
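
Note that the two compactor call sites pick different values for the new flag: CompactorMR passes true while Initiator passes false. One plausible reading (an inference, not stated in the patch) is that the compaction job gains nothing from empty inputs, while the Initiator's size-based heuristics should observe everything on disk. Summarized as a sketch:

    // Illustrative summary of the flag choice at each call site; the rationale
    // in the comments is an inference, not text from the patch.
    public enum AcidScanCaller {
      COMPACTOR_MR(true),   // building the compaction job: skip empty files
      INITIATOR(false);     // sizing directories to decide on compaction: see all files

      public final boolean ignoreEmptyFiles;

      AcidScanCaller(boolean ignoreEmptyFiles) {
        this.ignoreEmptyFiles = ignoreEmptyFiles;
      }
    }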

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 6f84708..4fafe8c 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -529,11 +529,11 @@ public class TestInputOutputFormat {
    public void testFileGenerator() throws Exception {
      OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
      MockFileSystem fs = new MockFileSystem(conf,
-         new MockFile("mock:/a/b/part-00", 1000, new byte[0]),
-         new MockFile("mock:/a/b/part-01", 1000, new byte[0]),
-         new MockFile("mock:/a/b/_part-02", 1000, new byte[0]),
-         new MockFile("mock:/a/b/.part-03", 1000, new byte[0]),
-         new MockFile("mock:/a/b/part-04", 1000, new byte[0]));
+         new MockFile("mock:/a/b/part-00", 1000, new byte[1]),
+         new MockFile("mock:/a/b/part-01", 1000, new byte[1]),
+         new MockFile("mock:/a/b/_part-02", 1000, new byte[1]),
+         new MockFile("mock:/a/b/.part-03", 1000, new byte[1]),
+         new MockFile("mock:/a/b/part-04", 1000, new byte[1]));
      OrcInputFormat.FileGenerator gen =
        new OrcInputFormat.FileGenerator(context, fs,
            new MockPath(fs, "mock:/a/b"), false, null);
@@ -560,14 +560,14 @@ public class TestInputOutputFormat {
      conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");
      OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
      MockFileSystem fs = new MockFileSystem(conf,
-         new MockFile("mock:/a/1/part-00", 1000, new byte[0]),
-         new MockFile("mock:/a/1/part-01", 1000, new byte[0]),
-         new MockFile("mock:/a/2/part-00", 1000, new byte[0]),
-         new MockFile("mock:/a/2/part-01", 1000, new byte[0]),
-         new MockFile("mock:/a/3/base_0/1", 1000, new byte[0]),
-         new MockFile("mock:/a/4/base_0/1", 1000, new byte[0]),
-         new MockFile("mock:/a/5/base_0/1", 1000, new byte[0]),
-         new MockFile("mock:/a/5/delta_0_25/1", 1000, new byte[0])
+         new MockFile("mock:/a/1/part-00", 1000, new byte[1]),
+         new MockFile("mock:/a/1/part-01", 1000, new byte[1]),
+         new MockFile("mock:/a/2/part-00", 1000, new byte[1]),
+         new MockFile("mock:/a/2/part-01", 1000, new byte[1]),
+         new MockFile("mock:/a/3/base_0/1", 1000, new byte[1]),
+         new MockFile("mock:/a/4/base_0/1", 1000, new byte[1]),
+         new MockFile("mock:/a/5/base_0/1", 1000, new byte[1]),
+         new MockFile("mock:/a/5/delta_0_25/1", 1000, new byte[1])
      );

      OrcInputFormat.CombinedCtx combineCtx = new OrcInputFormat.CombinedCtx();
@@ -575,7 +575,7 @@ public class TestInputOutputFormat {
      SplitStrategy<?> ss = createOrCombineStrategy(context, fs, "mock:/a/1", combineCtx);
      assertNull(ss);
      assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
-     OrcInputFormat.ETLSplitStrategy etlSs = (OrcInputFormat.ETLSplitStrategy)combineCtx.combined;
+     OrcInputFormat.ETLSplitStrategy etlSs = combineCtx.combined;
      assertEquals(2, etlSs.files.size());
      assertTrue(etlSs.isOriginal);
      assertEquals(1, etlSs.dirs.size());
@@ -591,7 +591,7 @@ public class TestInputOutputFormat {
      assertEquals(4, etlSs.files.size());
      assertEquals(2, etlSs.dirs.size());
      assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
-     etlSs = (OrcInputFormat.ETLSplitStrategy)combineCtx.combined;
+     etlSs = combineCtx.combined;
      assertEquals(1, etlSs.files.size());
      assertFalse(etlSs.isOriginal);
      assertEquals(1, etlSs.dirs.size());
@@ -1478,6 +1478,7 @@ public class TestInputOutputFormat {
      org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
          outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
              properties, Reporter.NULL);
+     writer.write(new OrcSerde().serialize(null,null));
      writer.close(true);
      InputFormat<?,?> in = new OrcInputFormat();
      fs.setPermission(testFilePath, FsPermission.createImmutable((short) 0333));
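
The test updates follow from the new rules: the mock files grow from byte[0] to byte[1] so they still yield splits, and the permission test writes one row before close() so a real ORC file (not the new empty placeholder) is produced. The new invariant can be pinned down with a small JUnit-style check (illustrative, not part of the patch):

    import static org.junit.Assert.assertEquals;

    import java.util.Arrays;
    import java.util.List;
    import org.junit.Test;

    // Illustrative regression check for the new rule: zero-length files are
    // invisible to split generation.
    public class EmptyFileSplitRuleTest {
      private static long countSplits(List<Long> fileLengths) {
        long n = 0;
        for (long len : fileLengths) {
          if (len != 0) {
            n++;
          }
        }
        return n;
      }

      @Test
      public void emptyFilesProduceNoSplits() {
        assertEquals(3, countSplits(Arrays.asList(0L, 10L, 20L, 0L, 30L)));
      }
    }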

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
index be2b61e..d03bfe4 100644
--- a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
+++ b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
@@ -1104,7 +1104,7 @@ Partition Parameters:
   numFiles 8
   numRows 6
   rawDataSize 120
- totalSize 2400
+ totalSize 2004
  #### A masked pattern was here ####

  # Storage Information
@@ -1186,7 +1186,7 @@ Partition Parameters:
   numFiles 8
   numRows 6
   rawDataSize 120
- totalSize 2400
+ totalSize 2004
  #### A masked pattern was here ####

  # Storage Information

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/spark/vector_outer_join1.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vector_outer_join1.q.out b/ql/src/test/results/clientpositive/spark/vector_outer_join1.q.out
index 69d3f9c..66cc11b 100644
--- a/ql/src/test/results/clientpositive/spark/vector_outer_join1.q.out
+++ b/ql/src/test/results/clientpositive/spark/vector_outer_join1.q.out
@@ -182,11 +182,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                      Spark HashTable Sink Operator
                        keys:
                          0 _col2 (type: int)
@@ -203,11 +203,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -217,10 +217,10 @@ STAGE PLANS:
                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23
                        input vertices:
                          1 Map 2
-                      Statistics: Num rows: 16 Data size: 4632 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 4403 Basic stats: COMPLETE Column stats: NONE
                        File Output Operator
                          compressed: false
-                        Statistics: Num rows: 16 Data size: 4632 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 16 Data size: 4403 Basic stats: COMPLETE Column stats: NONE
                          table:
                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -298,11 +298,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                      Spark HashTable Sink Operator
                        keys:
                          0 _col0 (type: tinyint)
@@ -319,11 +319,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -333,10 +333,10 @@ STAGE PLANS:
                        outputColumnNames: _col0
                        input vertices:
                          1 Map 2
-                      Statistics: Num rows: 16 Data size: 4632 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 4403 Basic stats: COMPLETE Column stats: NONE
                        File Output Operator
                          compressed: false
-                        Statistics: Num rows: 16 Data size: 4632 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 16 Data size: 4403 Basic stats: COMPLETE Column stats: NONE
                          table:
                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -506,11 +506,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: cint (type: int)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                      Spark HashTable Sink Operator
                        keys:
                          0 _col1 (type: int)
@@ -522,11 +522,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                      Spark HashTable Sink Operator
                        keys:
                          0 _col0 (type: tinyint)
@@ -545,11 +545,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), cint (type: int)
                      outputColumnNames: _col0, _col1
-                    Statistics: Num rows: 15 Data size: 4211 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 4003 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -559,7 +559,7 @@ STAGE PLANS:
                        outputColumnNames: _col0
                        input vertices:
                          1 Map 3
-                      Statistics: Num rows: 16 Data size: 4632 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 4403 Basic stats: COMPLETE Column stats: NONE
                        Map Join Operator
                          condition map:
                               Left Outer Join0 to 1
@@ -569,7 +569,7 @@ STAGE PLANS:
                          outputColumnNames: _col0
                          input vertices:
                            1 Map 4
-                        Statistics: Num rows: 17 Data size: 5095 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 17 Data size: 4843 Basic stats: COMPLETE Column stats: NONE
                          Group By Operator
                            aggregations: count(), sum(_col0)
                            mode: hash

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/spark/vector_outer_join4.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vector_outer_join4.q.out b/ql/src/test/results/clientpositive/spark/vector_outer_join4.q.out
index c42ec7e..e64ea65 100644
--- a/ql/src/test/results/clientpositive/spark/vector_outer_join4.q.out
+++ b/ql/src/test/results/clientpositive/spark/vector_outer_join4.q.out
@@ -212,11 +212,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                      Spark HashTable Sink Operator
                        keys:
                          0 _col2 (type: int)
@@ -233,11 +233,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -247,10 +247,10 @@ STAGE PLANS:
                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23
                        input vertices:
                          1 Map 2
-                      Statistics: Num rows: 33 Data size: 5054 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 33 Data size: 4825 Basic stats: COMPLETE Column stats: NONE
                        File Output Operator
                          compressed: false
-                        Statistics: Num rows: 33 Data size: 5054 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 33 Data size: 4825 Basic stats: COMPLETE Column stats: NONE
                          table:
                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -363,11 +363,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                      Spark HashTable Sink Operator
                        keys:
                          0 _col0 (type: tinyint)
@@ -384,11 +384,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -398,10 +398,10 @@ STAGE PLANS:
                        outputColumnNames: _col0
                        input vertices:
                          1 Map 2
-                      Statistics: Num rows: 33 Data size: 5054 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 33 Data size: 4825 Basic stats: COMPLETE Column stats: NONE
                        File Output Operator
                          compressed: false
-                        Statistics: Num rows: 33 Data size: 5054 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 33 Data size: 4825 Basic stats: COMPLETE Column stats: NONE
                          table:
                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -876,11 +876,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: cint (type: int)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                      Spark HashTable Sink Operator
                        keys:
                          0 _col1 (type: int)
@@ -892,11 +892,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                      Spark HashTable Sink Operator
                        keys:
                          0 _col0 (type: tinyint)
@@ -915,11 +915,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), cint (type: int)
                      outputColumnNames: _col0, _col1
-                    Statistics: Num rows: 30 Data size: 4595 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 4387 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -929,7 +929,7 @@ STAGE PLANS:
                        outputColumnNames: _col0
                        input vertices:
                          1 Map 3
-                      Statistics: Num rows: 33 Data size: 5054 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 33 Data size: 4825 Basic stats: COMPLETE Column stats: NONE
                        Map Join Operator
                          condition map:
                               Left Outer Join0 to 1
@@ -938,7 +938,7 @@ STAGE PLANS:
                            1 _col0 (type: tinyint)
                          input vertices:
                            1 Map 4
-                        Statistics: Num rows: 36 Data size: 5559 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 36 Data size: 5307 Basic stats: COMPLETE Column stats: NONE
                          Group By Operator
                            aggregations: count()
                            mode: hash

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
index 79558d5..a90e3f6 100644
--- a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
+++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
@@ -1161,10 +1161,10 @@ Table: over1k_part_buck_orc
  #### A masked pattern was here ####
  Partition Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
- numFiles 8
+ numFiles 4
   numRows 6
   rawDataSize 120
- totalSize 2400
+ totalSize 2004
  #### A masked pattern was here ####

  # Storage Information
@@ -1243,10 +1243,10 @@ Table: over1k_part_buck_sort_orc
  #### A masked pattern was here ####
  Partition Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
- numFiles 8
+ numFiles 4
   numRows 6
   rawDataSize 120
- totalSize 2400
+ totalSize 2004
  #### A masked pattern was here ####

  # Storage Information

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
index fbeea6b..5292106 100644
--- a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
+++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
@@ -1074,7 +1074,7 @@ Table: over1k_part_buck
  #### A masked pattern was here ####
  Partition Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
- numFiles 8
+ numFiles 4
   numRows 6
   rawDataSize 156
   totalSize 162
@@ -1156,7 +1156,7 @@ Table: over1k_part_buck_sort
  #### A masked pattern was here ####
  Partition Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
- numFiles 8
+ numFiles 4
   numRows 6
   rawDataSize 156
   totalSize 162

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out b/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out
index 41c0d71..46527d6 100644
--- a/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out
+++ b/ql/src/test/results/clientpositive/tez/union_fast_stats.q.out
@@ -120,7 +120,7 @@ Table Parameters:
   numFiles 4
   numRows 0
   rawDataSize 0
- totalSize 4211
+ totalSize 4003
  #### A masked pattern was here ####

  # Storage Information
@@ -172,8 +172,8 @@ Table Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
   numFiles 4
   numRows 15
- rawDataSize 3651
- totalSize 4211
+ rawDataSize 3483
+ totalSize 4003
  #### A masked pattern was here ####

  # Storage Information
@@ -237,8 +237,8 @@ Table Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
   numFiles 5
   numRows 20
- rawDataSize 4720
- totalSize 5568
+ rawDataSize 4552
+ totalSize 5360
  #### A masked pattern was here ####

  # Storage Information

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out b/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out
index d962621..4e2e62c 100644
--- a/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out
+++ b/ql/src/test/results/clientpositive/tez/vector_outer_join1.q.out
@@ -184,11 +184,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -198,11 +198,11 @@ STAGE PLANS:
                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23
                        input vertices:
                          1 Map 2
-                      Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                        HybridGraceHashJoin: true
                        File Output Operator
                          compressed: false
-                        Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                          table:
                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -212,16 +212,16 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col2 (type: int)
                        sort order: +
                        Map-reduce partition columns: _col2 (type: int)
-                      Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                        value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean)
              Execution mode: vectorized

@@ -296,11 +296,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -310,11 +310,11 @@ STAGE PLANS:
                        outputColumnNames: _col0
                        input vertices:
                          1 Map 2
-                      Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                        HybridGraceHashJoin: true
                        File Output Operator
                          compressed: false
-                        Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                          table:
                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -324,16 +324,16 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col0 (type: tinyint)
                        sort order: +
                        Map-reduce partition columns: _col0 (type: tinyint)
-                      Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
              Execution mode: vectorized

    Stage: Stage-0
@@ -500,11 +500,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), cint (type: int)
                      outputColumnNames: _col0, _col1
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -514,7 +514,7 @@ STAGE PLANS:
                        outputColumnNames: _col0
                        input vertices:
                          1 Map 3
-                      Statistics: Num rows: 16 Data size: 4016 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 16 Data size: 3831 Basic stats: COMPLETE Column stats: NONE
                        HybridGraceHashJoin: true
                        Map Join Operator
                          condition map:
@@ -525,7 +525,7 @@ STAGE PLANS:
                          outputColumnNames: _col0
                          input vertices:
                            1 Map 4
-                        Statistics: Num rows: 17 Data size: 4417 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 17 Data size: 4214 Basic stats: COMPLETE Column stats: NONE
                          HybridGraceHashJoin: true
                          Group By Operator
                            aggregations: count(), sum(_col0)
@@ -541,31 +541,31 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: cint (type: int)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col0 (type: int)
                        sort order: +
                        Map-reduce partition columns: _col0 (type: int)
-                      Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
              Execution mode: vectorized
          Map 4
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
-                    Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col0 (type: tinyint)
                        sort order: +
                        Map-reduce partition columns: _col0 (type: tinyint)
-                      Statistics: Num rows: 15 Data size: 3651 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 15 Data size: 3483 Basic stats: COMPLETE Column stats: NONE
              Execution mode: vectorized
          Reducer 2
              Execution mode: vectorized

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out b/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out
index 9db8e00..a6690b6 100644
--- a/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out
+++ b/ql/src/test/results/clientpositive/tez/vector_outer_join4.q.out
@@ -214,11 +214,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
-                    Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -228,11 +228,11 @@ STAGE PLANS:
                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23
                        input vertices:
                          1 Map 2
-                      Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                        HybridGraceHashJoin: true
                        File Output Operator
                          compressed: false
-                        Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                          table:
                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -242,16 +242,16 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
-                  Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean)
                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col2 (type: int)
                        sort order: +
                        Map-reduce partition columns: _col2 (type: int)
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                        value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean)
              Execution mode: vectorized

@@ -361,11 +361,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -375,11 +375,11 @@ STAGE PLANS:
                        outputColumnNames: _col0
                        input vertices:
                          1 Map 2
- Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                        HybridGraceHashJoin: true
                        File Output Operator
                          compressed: false
- Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                          table:
                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -389,16 +389,16 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col0 (type: tinyint)
                        sort order: +
                        Map-reduce partition columns: _col0 (type: tinyint)
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
              Execution mode: vectorized

    Stage: Stage-0
@@ -870,11 +870,11 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint), cint (type: int)
                      outputColumnNames: _col0, _col1
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                      Map Join Operator
                        condition map:
                             Left Outer Join0 to 1
@@ -884,7 +884,7 @@ STAGE PLANS:
                        outputColumnNames: _col0
                        input vertices:
                          1 Map 3
- Statistics: Num rows: 33 Data size: 7706 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 33 Data size: 7521 Basic stats: COMPLETE Column stats: NONE
                        HybridGraceHashJoin: true
                        Map Join Operator
                          condition map:
@@ -894,7 +894,7 @@ STAGE PLANS:
                            1 _col0 (type: tinyint)
                          input vertices:
                            1 Map 4
- Statistics: Num rows: 36 Data size: 8476 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 36 Data size: 8273 Basic stats: COMPLETE Column stats: NONE
                          HybridGraceHashJoin: true
                          Group By Operator
                            aggregations: count()
@@ -910,31 +910,31 @@ STAGE PLANS:
              Map Operator Tree:
                  TableScan
                    alias: c
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: cint (type: int)
                      outputColumnNames: _col0
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col0 (type: int)
                        sort order: +
                        Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
              Execution mode: vectorized
          Map 4
              Map Operator Tree:
                  TableScan
                    alias: c
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                    Select Operator
                      expressions: ctinyint (type: tinyint)
                      outputColumnNames: _col0
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col0 (type: tinyint)
                        sort order: +
                        Map-reduce partition columns: _col0 (type: tinyint)
- Statistics: Num rows: 30 Data size: 7006 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 30 Data size: 6838 Basic stats: COMPLETE Column stats: NONE
              Execution mode: vectorized
          Reducer 2
              Execution mode: vectorized

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/ql/src/test/results/clientpositive/union_fast_stats.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/union_fast_stats.q.out b/ql/src/test/results/clientpositive/union_fast_stats.q.out
index a02ff04..e908ec0 100644
--- a/ql/src/test/results/clientpositive/union_fast_stats.q.out
+++ b/ql/src/test/results/clientpositive/union_fast_stats.q.out
@@ -117,10 +117,10 @@ Retention: 0
  Table Type: MANAGED_TABLE
  Table Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
- numFiles 4
+ numFiles 3
   numRows 15
   rawDataSize 3483
- totalSize 4211
+ totalSize 4003
  #### A masked pattern was here ####

  # Storage Information
@@ -170,10 +170,10 @@ Retention: 0
  Table Type: MANAGED_TABLE
  Table Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
- numFiles 4
+ numFiles 3
   numRows 15
- rawDataSize 3651
- totalSize 4211
+ rawDataSize 3483
+ totalSize 4003
  #### A masked pattern was here ####

  # Storage Information
@@ -235,10 +235,10 @@ Retention: 0
  Table Type: MANAGED_TABLE
  Table Parameters:
   COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
- numFiles 5
+ numFiles 4
   numRows 20
- rawDataSize 4720
- totalSize 5568
+ rawDataSize 4552
+ totalSize 5360
  #### A masked pattern was here ####

  # Storage Information
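Note that the numbers in these Table Parameters hunks are internally consistent with one fewer empty bucket file being written per table: numFiles drops by one in each case, and totalSize shrinks by the same 208 bytes in both hunks shown above (4211 - 4003 and 5568 - 5360), which would correspond to a single empty ORC file that is no longer created. A trivial check of that arithmetic, as a sketch (the class name is illustrative, not part of the patch):

public class StatsDeltaCheck {
  public static void main(String[] args) {
    // totalSize deltas from the two Table Parameters hunks above
    System.out.println(4211 - 4003); // 208
    System.out.println(5568 - 5360); // 208
    // numFiles drops by exactly one in each hunk
    System.out.println(4 - 3); // 1
    System.out.println(5 - 4); // 1
  }
}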

http://git-wip-us.apache.org/repos/asf/hive/blob/7e3605dd/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
----------------------------------------------------------------------
diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
index 31060a2..9a3a31c 100644
--- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
+++ b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
@@ -164,7 +164,7 @@ public class Hadoop23Shims extends HadoopShimsSecure {
          Iterator<FileStatus> it = result.iterator();
          while (it.hasNext()) {
            FileStatus stat = it.next();
- if (!stat.isFile()) {
+ if (!stat.isFile() || (stat.getLen() == 0 && !stat.getPath().toUri().getScheme().equals("nullscan"))) {
              it.remove();
            }
          }
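The shim change above is the core of HIVE-13040: the file listing now drops zero-length files as well as directories, except for paths under the synthetic "nullscan" scheme, so no splits are ever generated for empty bucket files. A minimal, self-contained sketch of that filter follows, using a plain POJO in place of Hadoop's FileStatus; the SimpleStatus and filterEmptyFiles names are illustrative, not part of the Hive codebase:

import java.net.URI;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class EmptyFileFilterSketch {

  // Stand-in for org.apache.hadoop.fs.FileStatus; fields only, no Hadoop dependency.
  static class SimpleStatus {
    final URI path;
    final long len;
    final boolean isFile;

    SimpleStatus(String path, long len, boolean isFile) {
      this.path = URI.create(path);
      this.len = len;
      this.isFile = isFile;
    }
  }

  // Mirrors the new condition in Hadoop23Shims: remove directories and
  // zero-length files, but keep zero-length entries whose scheme is
  // "nullscan", since those are Hive's synthetic metadata-only paths.
  static void filterEmptyFiles(List<SimpleStatus> result) {
    Iterator<SimpleStatus> it = result.iterator();
    while (it.hasNext()) {
      SimpleStatus stat = it.next();
      boolean emptyRealFile = stat.len == 0 && !"nullscan".equals(stat.path.getScheme());
      if (!stat.isFile || emptyRealFile) {
        it.remove();
      }
    }
  }

  public static void main(String[] args) {
    List<SimpleStatus> listing = new ArrayList<>();
    listing.add(new SimpleStatus("hdfs://nn/warehouse/t/bucket_0", 450, true)); // kept
    listing.add(new SimpleStatus("hdfs://nn/warehouse/t/bucket_1", 0, true));   // dropped: empty bucket
    listing.add(new SimpleStatus("nullscan://null/warehouse/t/p", 0, true));    // kept: nullscan scheme
    filterEmptyFiles(listing);
    System.out.println(listing.size()); // prints 2
  }
}

Skipping the empty files at listing time is what drives the golden-file changes earlier in this diff: fewer files written, smaller totalSize, and correspondingly smaller Data size estimates in the query plans.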
