Author: gates
Date: Fri Dec 5 21:13:34 2014
New Revision: 1643436

URL: http://svn.apache.org/r1643436
Log:
HIVE-7896 orcfiledump should be able to dump data (Alan Gates, reviewed by Prasanth Jayachandran)
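
For reference, the command line surface this change adds (the -d/--data, --rowindex, and -h/--help options are taken from the patch below; the `hive --service orcfiledump` wrapper is an assumption about how FileDump.main is typically launched, and the paths are placeholders):

  # print file metadata, as before
  hive --service orcfiledump /path/to/file.orc

  # print row index stats for columns 0 and 2
  hive --service orcfiledump --rowindex 0,2 /path/to/file.orc

  # new in this patch: dump the rows themselves, one JSON document per line
  hive --service orcfiledump -d /path/to/file.orc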

Modified:
     hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
     hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java?rev=1643436&r1=1643435&r2=1643436&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java Fri Dec 5 21:13:34 2014
@@ -17,20 +17,34 @@
   */
  package org.apache.hadoop.hive.ql.io.orc;

+import java.io.OutputStreamWriter;
  import java.util.ArrayList;
+import java.util.Arrays;
  import java.util.List;

  import java.io.IOException;
  import java.text.DecimalFormat;
-import java.util.List;
+import java.util.Map;

+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex;
  import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.serde2.io.ByteWritable;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.ShortWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONWriter;

  /**
   * A tool for printing out the file structure of ORC files.
@@ -43,24 +57,40 @@ public final class FileDump {

    public static void main(String[] args) throws Exception {
      Configuration conf = new Configuration();
-    List<String> files = new ArrayList<String>();
+
     List<Integer> rowIndexCols = null;
-    for (String arg : args) {
-      if (arg.startsWith("--")) {
-        if (arg.startsWith(ROWINDEX_PREFIX)) {
-          String[] colStrs = arg.substring(ROWINDEX_PREFIX.length()).split(",");
-          rowIndexCols = new ArrayList<Integer>(colStrs.length);
-          for (String colStr : colStrs) {
-            rowIndexCols.add(Integer.parseInt(colStr));
-          }
-        } else {
-          System.err.println("Unknown argument " + arg);
-        }
-      } else {
-        files.add(arg);
+    Options opts = createOptions();
+    CommandLine cli = new GnuParser().parse(opts, args);
+
+    if (cli.hasOption('h')) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("orcfiledump", opts);
+      return;
+    }
+
+    boolean dumpData = cli.hasOption('d');
+    if (cli.hasOption("rowindex")) {
+      String[] colStrs = cli.getOptionValue("rowindex").split(",");
+      rowIndexCols = new ArrayList<Integer>(colStrs.length);
+      for (String colStr : colStrs) {
+        rowIndexCols.add(Integer.parseInt(colStr));
       }
     }

+    String[] files = cli.getArgs();
+    if (dumpData) printData(Arrays.asList(files), conf);
+    else printMetaData(Arrays.asList(files), conf, rowIndexCols);
+  }
+
+  private static void printData(List<String> files, Configuration conf) throws IOException,
+      JSONException {
+    for (String file : files) {
+      printJsonData(conf, file);
+    }
+  }
+
+  private static void printMetaData(List<String> files, Configuration conf,
+      List<Integer> rowIndexCols) throws IOException {
      for (String filename : files) {
        System.out.println("Structure for " + filename);
        Path path = new Path(filename);
@@ -181,4 +211,149 @@ public final class FileDump {
      }
      return paddedBytes;
    }
+
+  static Options createOptions() {
+    Options result = new Options();
+
+    // add -d and --data to print the rows
+    result.addOption(OptionBuilder
+        .withLongOpt("data")
+        .withDescription("Should the data be printed")
+        .create('d'));
+
+    result.addOption(OptionBuilder
+        .withLongOpt("help")
+        .withDescription("print help message")
+        .create('h'));
+
+    result.addOption(OptionBuilder
+        .withLongOpt("rowindex")
+        .withArgName("comma separated list of column ids for which row index should be printed")
+        .withDescription("Dump stats for column number(s)")
+        .hasArg()
+        .create());
+
+    return result;
+  }
+
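+  // Note: the printers below walk the ORC type tree recursively. Struct fields
+  // become JSON object keys, lists become JSON arrays, unions are unwrapped to
+  // their current tag, and map entries are emitted as {"_key":..., "_value":...}
+  // objects, since ORC map keys are not restricted to strings the way JSON keys are.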
+  private static void printMap(JSONWriter writer,
+                               Map<Object, Object> obj,
+                               List<OrcProto.Type> types,
+                               OrcProto.Type type) throws IOException, JSONException {
+    writer.array();
+    int keyType = type.getSubtypes(0);
+    int valueType = type.getSubtypes(1);
+    for(Map.Entry<Object, Object> item : obj.entrySet()) {
+      writer.object();
+      writer.key("_key");
+      printObject(writer, item.getKey(), types, keyType);
+      writer.key("_value");
+      printObject(writer, item.getValue(), types, valueType);
+      writer.endObject();
+    }
+    writer.endArray();
+  }
+
+  private static void printList(JSONWriter writer,
+                                List<Object> obj,
+                                List<OrcProto.Type> types,
+                                OrcProto.Type type) throws IOException, JSONException {
+    int subtype = type.getSubtypes(0);
+    writer.array();
+    for(Object item : obj) {
+      printObject(writer, item, types, subtype);
+    }
+    writer.endArray();
+  }
+
+  private static void printUnion(JSONWriter writer,
+                                 OrcUnion obj,
+                                 List<OrcProto.Type> types,
+                                 OrcProto.Type type) throws IOException, JSONException {
+    int subtype = type.getSubtypes(obj.getTag());
+    printObject(writer, obj.getObject(), types, subtype);
+  }
+
+  static void printStruct(JSONWriter writer,
+                          OrcStruct obj,
+                          List<OrcProto.Type> types,
+                          OrcProto.Type type) throws IOException, JSONException {
+    writer.object();
+    List<Integer> fieldTypes = type.getSubtypesList();
+    for(int i = 0; i < fieldTypes.size(); ++i) {
+      writer.key(type.getFieldNames(i));
+      printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i));
+    }
+    writer.endObject();
+  }
+
+  static void printObject(JSONWriter writer,
+                          Object obj,
+                          List<OrcProto.Type> types,
+                          int typeId) throws IOException, JSONException {
+    OrcProto.Type type = types.get(typeId);
+    if (obj == null) {
+      writer.value(null);
+    } else {
+      switch (type.getKind()) {
+        case STRUCT:
+          printStruct(writer, (OrcStruct) obj, types, type);
+          break;
+        case UNION:
+          printUnion(writer, (OrcUnion) obj, types, type);
+          break;
+        case LIST:
+          printList(writer, (List<Object>) obj, types, type);
+          break;
+        case MAP:
+          printMap(writer, (Map<Object, Object>) obj, types, type);
+          break;
+        case BYTE:
+          writer.value(((ByteWritable) obj).get());
+          break;
+        case SHORT:
+          writer.value(((ShortWritable) obj).get());
+          break;
+        case INT:
+          writer.value(((IntWritable) obj).get());
+          break;
+        case LONG:
+          writer.value(((LongWritable) obj).get());
+          break;
+        case FLOAT:
+          writer.value(((FloatWritable) obj).get());
+          break;
+        case DOUBLE:
+          writer.value(((DoubleWritable) obj).get());
+          break;
+        case BOOLEAN:
+          writer.value(((BooleanWritable) obj).get());
+          break;
+        default:
+          writer.value(obj.toString());
+          break;
+      }
+    }
+  }
+
+  static void printJsonData(Configuration conf,
+                            String filename) throws IOException, JSONException {
+    Path path = new Path(filename);
+    Reader reader = OrcFile.createReader(path.getFileSystem(conf), path);
+    OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8");
+    RecordReader rows = reader.rows(null);
+    Object row = null;
+    List<OrcProto.Type> types = reader.getTypes();
+    while (rows.hasNext()) {
+      row = rows.next(row);
+      JSONWriter writer = new JSONWriter(out);
+      printObject(writer, row, types, 0);
+      out.write("\n");
+      out.flush();
+    }
+  }
  }
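
For reference, printJsonData streams one JSON document per row (rather than one enclosing JSON array), so the output can be consumed line by line. An illustrative output line for a hypothetical struct<name:string,age:int,tags:array<string>> schema, with made-up data:

  {"name":"alice","age":34,"tags":["a","b"]}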

Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java?rev=1643436&r1=1643435&r2=1643436&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java Fri Dec 5 21:13:34 2014
@@ -22,19 +22,32 @@ import static org.junit.Assert.assertEqu
  import static org.junit.Assert.assertNull;

  import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
  import java.io.File;
  import java.io.FileOutputStream;
  import java.io.FileReader;
  import java.io.PrintStream;
+import java.math.BigDecimal;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
  import java.util.Random;

+import junit.framework.Assert;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveChar;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
  import org.apache.hadoop.hive.conf.HiveConf;
  import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
  import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
  import org.apache.hive.common.util.HiveTestUtils;
  import org.junit.Before;
  import org.junit.Test;

@@ -65,6 +78,55 @@ public class TestFileDump {
      }
    }

+  static class AllTypesRecord {
+    static class Struct {
+      int i;
+      String s;
+
+      Struct(int i, String s) {
+        this.i = i;
+        this.s = s;
+      }
+    }
+
+    boolean b;
+    byte bt;
+    short s;
+    int i;
+    long l;
+    float f;
+    double d;
+    HiveDecimal de;
+    Timestamp t;
+    Date dt;
+    String str;
+    HiveChar c;
+    HiveVarchar vc;
+    Map<String, String> m;
+    List<Integer> a;
+    Struct st;
+
+    AllTypesRecord(boolean b, byte bt, short s, int i, long l, float f, double d, HiveDecimal de,
+                   Timestamp t, Date dt, String str, HiveChar c, HiveVarchar vc,
+                   Map<String, String> m, List<Integer> a, Struct st) {
+      this.b = b;
+      this.bt = bt;
+      this.s = s;
+      this.i = i;
+      this.l = l;
+      this.f = f;
+      this.d = d;
+      this.de = de;
+      this.t = t;
+      this.dt = dt;
+      this.str = str;
+      this.c = c;
+      this.vc = vc;
+      this.m = m;
+      this.a = a;
+      this.st = st;
+    }
+  }
+
    private static void checkOutput(String expected,
                                    String actual) throws Exception {
      BufferedReader eStream =
@@ -124,6 +186,72 @@ public class TestFileDump {
      checkOutput(outputFilename, workDir + File.separator + outputFilename);
    }

+  @Test
+  public void testDataDump() throws Exception {
+    ObjectInspector inspector;
+    synchronized (TestOrcFile.class) {
+      inspector = ObjectInspectorFactory.getReflectionObjectInspector
+          (AllTypesRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+        100000, CompressionKind.NONE, 10000, 1000);
+    Map<String, String> m = new HashMap<String, String>(2);
+    m.put("k1", "v1");
+    writer.addRow(new AllTypesRecord(
+        true,
+        (byte) 10,
+        (short) 100,
+        1000,
+        10000L,
+        4.0f,
+        20.0,
+        HiveDecimal.create(new BigDecimal(4.2222)),
+        new Timestamp(1416967764000L),
+        new Date(1416967764000L),
+        "string",
+        new HiveChar("hello", 5),
+        new HiveVarchar("hello", 10),
+        m,
+        Arrays.asList(100, 200),
+        new AllTypesRecord.Struct(10, "foo")));
+    m.clear();
+    m.put("k3", "v3");
+    writer.addRow(new AllTypesRecord(
+        false,
+        (byte) 20,
+        (short) 200,
+        2000,
+        20000L,
+        8.0f,
+        40.0,
+        HiveDecimal.create(new BigDecimal(2.2222)),
+        new Timestamp(1416967364000L),
+        new Date(1411967764000L),
+        "abcd",
+        new HiveChar("world", 5),
+        new HiveVarchar("world", 10),
+        m,
+        Arrays.asList(200, 300),
+        new AllTypesRecord.Struct(20, "bar")));
+
+    writer.close();
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+    // replace stdout and run command
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toString(), "-d"});
+    System.out.flush();
+    System.setOut(origOut);
+
+    String[] lines = myOut.toString().split("\n");
+    // Don't be fooled by the big space in the middle, this line is quite long
+    assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.222199999999999953\",\"t\":\"2014-11-25 18:09:24\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello \",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
+    assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.222199999999999953\",\"t\":\"2014-11-25 18:02:44\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world \",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
+  }
+
    // Test that if the fraction of rows that have distinct strings is greater than the configured
    // threshold dictionary encoding is turned off. If dictionary encoding is turned off the length
    // of the dictionary stream for the column will be 0 in the ORC file dump.
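
For reference, the threshold that comment refers to is the hive.exec.orc.dictionary.key.size.threshold property (0.8 by default). A minimal sketch of how a test can lower it so that a mostly-distinct string column falls back to direct encoding (the 0.49 value here is arbitrary, for illustration only):

  Configuration conf = new Configuration();
  // Hypothetical cutoff: disable dictionary encoding once more than 49% of the
  // values in a string column are distinct.
  conf.setFloat("hive.exec.orc.dictionary.key.size.threshold", 0.49f);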
