FAQ
A question about how to use Hive; I'm trying to make sure what I'm doing makes
sense before going too far.

I'm trying to update HBase with the results of a Hive query. What I'm doing
right now is connecting to the Hive JDBC server and running the query, then
running a MapReduce job that reads the output files and uses them to update
HBase. This works; I just wonder if there's a better way.

My questions are:
1) Hopefully I'm asking this right, but is there a way to save this as a job
so that it doesn't have to be built each time? We'd probably run this exact
same query a few times a day. (One option is sketched after this list.)
2) Is JDBC a good way to do this? I've seen some mention of using the
CliDriver, but I was never able to get that working correctly. Would that be a
better option, or are there other ways to do this?


Thanks!



If anyone is interested, here's my code:


import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.IdentityTableReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class SetLabels extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        // Load the Hive JDBC driver.
        try {
            Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver");
        } catch (ClassNotFoundException e) {
            System.out.println("Unable to load Hive Driver");
            System.exit(1);
        }

        // Run the Hive query, writing its result to a directory on HDFS.
        try {
            Connection con = DriverManager.getConnection(
                "jdbc:hive://localhost:10000/default", "", "");
            Statement stmt = con.createStatement();

            String sql = "INSERT OVERWRITE DIRECTORY 'hdfs://localhost:8020/tmp/labels' "
                + "SELECT u.key, COALESCE(ul.label, dl.label, u.description) AS label "
                + "FROM url u "
                + "LEFT OUTER JOIN default_labels dl ON u.description = dl.description "
                + "LEFT OUTER JOIN user_labels ul ON u.description = ul.description";
            stmt.executeQuery(sql);

        } catch (SQLException e) {
            System.exit(1);
        }

        // Run a map-only job that reads the query output and updates HBase.
        int res = ToolRunner.run(new Configuration(), new SetLabels(), args);
        System.exit(res);
    }

    public int run(String[] args) throws IOException, InterruptedException,
            ClassNotFoundException {
        runJob();
        return 0;
    }

    public void runJob() throws IOException, InterruptedException,
            ClassNotFoundException {
        Configuration conf = getConf();

        Path inputPath = new Path("hdfs://localhost:8020/tmp/labels");

        Job job = new Job(conf, "Set Labels");
        FileInputFormat.setInputPaths(job, inputPath);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(SetLabelsMapper.class);
        // Configures TableOutputFormat for the "urls" table; with zero reduce
        // tasks the mapper's Puts are written straight to HBase.
        TableMapReduceUtil.initTableReducerJob("urls", IdentityTableReducer.class, job);
        job.setNumReduceTasks(0);

        job.waitForCompletion(true);
    }
}

// (SetLabelsMapper would live in its own source file.)
public class SetLabelsMapper extends Mapper<Object, Text, ImmutableBytesWritable, Put> {

    // Hive's default field delimiter is Ctrl-A (\001).
    private String HIVE_DELIMITER = "" + (char) 1;

    @Override
    public void map(Object key, Text line, Context context) throws IOException {
        String[] tokens = line.toString().split(HIVE_DELIMITER, -1);

        byte[] rowKey = Bytes.toBytes(tokens[0]);
        String label = tokens[1];

        // Write the label into the details:label column of the row.
        Put put = new Put(rowKey);
        put.add(Bytes.toBytes("details"), Bytes.toBytes("label"), Bytes.toBytes(label));

        try {
            context.write(new ImmutableBytesWritable(rowKey), put);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}


  • John Sichi at Sep 20, 2010 at 6:11 pm
    Hive is capable of writing directly into HBase via INSERT:

    http://wiki.apache.org/hadoop/Hive/HBaseIntegration

    JVS
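A minimal sketch of the direct-INSERT approach John Sichi points to above,
assuming the existing HBase table is named urls with a details:label column as
in the code earlier in the thread; the Hive table name hbase_urls and the key
column name are illustrative, and the full DDL details are on the
HBaseIntegration wiki page linked above:

CREATE EXTERNAL TABLE hbase_urls (key string, label string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,details:label")
TBLPROPERTIES ("hbase.table.name" = "urls");

INSERT OVERWRITE TABLE hbase_urls
SELECT u.key, COALESCE(ul.label, dl.label, u.description) AS label
FROM url u
LEFT OUTER JOIN default_labels dl ON u.description = dl.description
LEFT OUTER JOIN user_labels ul ON u.description = ul.description;

With a mapping like this, the query's output is written into HBase by the
INSERT itself, so the separate JDBC-plus-MapReduce step can be dropped.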

