FAQ
Repository: hive
Updated Branches:
   refs/heads/master 4588c6076 -> 15220e8b5


HIVE-13291: ORC BI Split strategy should consider block size instead of file size (Prasanth Jayachandran reviewed by Gopal V)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/15220e8b
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/15220e8b
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/15220e8b

Branch: refs/heads/master
Commit: 15220e8b52bf934500ff8d98a131ae1059cfe6dc
Parents: 4588c60
Author: Prasanth Jayachandran <j.prasanth.j@gmail.com>
Authored: Mon Mar 21 12:31:52 2016 -0500
Committer: Prasanth Jayachandran <j.prasanth.j@gmail.com>
Committed: Mon Mar 21 12:31:52 2016 -0500

----------------------------------------------------------------------
  .../hadoop/hive/ql/io/orc/OrcInputFormat.java | 14 +--
  .../hive/ql/io/orc/TestInputOutputFormat.java | 95 ++++++++++++++++++++
  2 files changed, 102 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/15220e8b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 8b611bb..fe0be7b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -891,7 +891,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
      private final boolean isOriginal;
      private final List<DeltaMetaData> deltas;
      private final FileSystem fs;
- private final Context context;
      private final Path dir;
      private final boolean allowSyntheticFileIds;

@@ -899,7 +898,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
          Path dir, List<HdfsFileStatusWithId> fileStatuses, boolean isOriginal,
          List<DeltaMetaData> deltas, boolean[] covered, boolean allowSyntheticFileIds) {
        super(dir, context.numBuckets, deltas, covered);
- this.context = context;
        this.fileStatuses = fileStatuses;
        this.isOriginal = isOriginal;
        this.deltas = deltas;
@@ -914,15 +912,17 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
        for (HdfsFileStatusWithId file : fileStatuses) {
          FileStatus fileStatus = file.getFileStatus();
          if (fileStatus.getLen() != 0) {
- String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
- .getHosts();
            Object fileKey = file.getFileId();
            if (fileKey == null && allowSyntheticFileIds) {
              fileKey = new SyntheticFileId(fileStatus);
            }
- OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, 0,
- fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1);
- splits.add(orcSplit);
+ TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
+ for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
+ OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
+ entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
+ deltas, -1);
+ splits.add(orcSplit);
+ }
          }
        }


http://git-wip-us.apache.org/repos/asf/hive/blob/15220e8b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 1a64f3a..c88f6d8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -555,6 +555,101 @@ public class TestInputOutputFormat {
    }

    @Test
+ public void testBIStrategySplitBlockBoundary() throws Exception {
+ conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
+ OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
+ MockFileSystem fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[1], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[1], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[1], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[1], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[1], new MockBlock("host1", "host2")));
+ OrcInputFormat.FileGenerator gen =
+ new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ OrcInputFormat.SplitStrategy splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ List<OrcSplit> splits = splitStrategy.getSplits();
+ int numSplits = splits.size();
+ assertEquals(5, numSplits);
+
+ context = new OrcInputFormat.Context(conf);
+ fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[1000], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[1000], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[1000], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[1000], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[1000], new MockBlock("host1", "host2")));
+ gen = new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ splits = splitStrategy.getSplits();
+ numSplits = splits.size();
+ assertEquals(5, numSplits);
+
+ context = new OrcInputFormat.Context(conf);
+ fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")));
+ gen = new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ splits = splitStrategy.getSplits();
+ numSplits = splits.size();
+ assertEquals(10, numSplits);
+
+ context = new OrcInputFormat.Context(conf);
+ fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")));
+ gen = new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ splits = splitStrategy.getSplits();
+ numSplits = splits.size();
+ assertEquals(10, numSplits);
+
+ context = new OrcInputFormat.Context(conf);
+ fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")));
+ gen = new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ splits = splitStrategy.getSplits();
+ numSplits = splits.size();
+ assertEquals(15, numSplits);
+ }
+
+ @Test
    public void testEtlCombinedStrategy() throws Exception {
      conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
      conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");

Search Discussions

  • Prasanthj at Mar 23, 2016 at 6:53 am
    Repository: hive
    Updated Branches:
       refs/heads/branch-1 908e600ef -> 467e292ba


    HIVE-13291: ORC BI Split strategy should consider block size instead of file size (Prasanth Jayachandran reviewed by Gopal V)


    Project: http://git-wip-us.apache.org/repos/asf/hive/repo
    Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/467e292b
    Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/467e292b
    Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/467e292b

    Branch: refs/heads/branch-1
    Commit: 467e292ba12054dcb4e4b637b880a979e7c256e5
    Parents: 908e600
    Author: Prasanth Jayachandran <prasanthj@apache.org>
    Authored: Wed Mar 23 01:52:31 2016 -0500
    Committer: Prasanth Jayachandran <prasanthj@apache.org>
    Committed: Wed Mar 23 01:52:31 2016 -0500

    ----------------------------------------------------------------------
      .../hadoop/hive/ql/io/orc/OrcInputFormat.java | 14 +--
      .../hive/ql/io/orc/TestInputOutputFormat.java | 90 ++++++++++++++++++++
      2 files changed, 97 insertions(+), 7 deletions(-)
    ----------------------------------------------------------------------


    http://git-wip-us.apache.org/repos/asf/hive/blob/467e292b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
    ----------------------------------------------------------------------
    diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
    index 47e8b34..2d6ef9a 100644
    --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
    +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
    @@ -608,14 +608,12 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
          boolean isOriginal;
          List<DeltaMetaData> deltas;
          FileSystem fs;
    - Context context;
          Path dir;

          public BISplitStrategy(Context context, FileSystem fs,
              Path dir, List<FileStatus> fileStatuses, boolean isOriginal,
              List<DeltaMetaData> deltas, boolean[] covered) {
            super(dir, context.numBuckets, deltas, covered);
    - this.context = context;
            this.fileStatuses = fileStatuses;
            this.isOriginal = isOriginal;
            this.deltas = deltas;
    @@ -627,11 +625,13 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
          public List<OrcSplit> getSplits() throws IOException {
            List<OrcSplit> splits = Lists.newArrayList();
            for (FileStatus fileStatus : fileStatuses) {
    - String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
    - .getHosts();
    - OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts,
    - null, isOriginal, true, deltas, -1);
    - splits.add(orcSplit);
    + TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
    + for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
    + OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), entry.getKey(),
    + entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
    + deltas, -1);
    + splits.add(orcSplit);
    + }
            }

            // add uncovered ACID delta splits

    http://git-wip-us.apache.org/repos/asf/hive/blob/467e292b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
    ----------------------------------------------------------------------
    diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
    index c0fcedc..a345884 100644
    --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
    +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
    @@ -540,6 +540,96 @@ public class TestInputOutputFormat {

        }

    + @Test
    + public void testBIStrategySplitBlockBoundary() throws Exception {
    + conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
    + OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
    + MockFileSystem fs = new MockFileSystem(conf,
    + new MockFile("mock:/a/b/part-00", 1000, new byte[1], new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-01", 1000, new byte[1], new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-02", 1000, new byte[1], new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-03", 1000, new byte[1], new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-04", 1000, new byte[1], new MockBlock("host1", "host2")));
    + OrcInputFormat.FileGenerator gen =
    + new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"));
    + OrcInputFormat.SplitStrategy splitStrategy = gen.call();
    + assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
    + List<OrcSplit> splits = splitStrategy.getSplits();
    + int numSplits = splits.size();
    + assertEquals(5, numSplits);
    +
    + context = new OrcInputFormat.Context(conf);
    + fs = new MockFileSystem(conf,
    + new MockFile("mock:/a/b/part-00", 1000, new byte[1000], new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-01", 1000, new byte[1000], new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-02", 1000, new byte[1000], new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-03", 1000, new byte[1000], new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-04", 1000, new byte[1000], new MockBlock("host1", "host2")));
    + gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"));
    + splitStrategy = gen.call();
    + assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
    + splits = splitStrategy.getSplits();
    + numSplits = splits.size();
    + assertEquals(5, numSplits);
    +
    + context = new OrcInputFormat.Context(conf);
    + fs = new MockFileSystem(conf,
    + new MockFile("mock:/a/b/part-00", 1000, new byte[1100], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-01", 1000, new byte[1100], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-02", 1000, new byte[1100], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-03", 1000, new byte[1100], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-04", 1000, new byte[1100], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")));
    + gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"));
    + splitStrategy = gen.call();
    + assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
    + splits = splitStrategy.getSplits();
    + numSplits = splits.size();
    + assertEquals(10, numSplits);
    +
    + context = new OrcInputFormat.Context(conf);
    + fs = new MockFileSystem(conf,
    + new MockFile("mock:/a/b/part-00", 1000, new byte[2000], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-01", 1000, new byte[2000], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-02", 1000, new byte[2000], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-03", 1000, new byte[2000], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-04", 1000, new byte[2000], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2")));
    + gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"));
    + splitStrategy = gen.call();
    + assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
    + splits = splitStrategy.getSplits();
    + numSplits = splits.size();
    + assertEquals(10, numSplits);
    +
    + context = new OrcInputFormat.Context(conf);
    + fs = new MockFileSystem(conf,
    + new MockFile("mock:/a/b/part-00", 1000, new byte[2200], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-01", 1000, new byte[2200], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-02", 1000, new byte[2200], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-03", 1000, new byte[2200], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
    + new MockFile("mock:/a/b/part-04", 1000, new byte[2200], new MockBlock("host1", "host2"),
    + new MockBlock("host1", "host2"), new MockBlock("host1", "host2")));
    + gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"));
    + splitStrategy = gen.call();
    + assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
    + splits = splitStrategy.getSplits();
    + numSplits = splits.size();
    + assertEquals(15, numSplits);
    + }
    +
        public static class MockBlock {
          int offset;
          int length;

Related Discussions

Discussion Navigation
view thread | post
Discussion Overview
group: commits @
categories: hive, hadoop
posted: Mar 21, '16 at 5:32p
active: Mar 23, '16 at 6:53a
posts: 2
users: 1
website: hive.apache.org

1 user in discussion

Prasanthj: 2 posts

People

Translate

site design / logo © 2021 Grokbase