HBase BulkLoad: batch inserting rows with multiple columns

This article shows how to process data with Apache Spark and bulk load it into HBase. The example code configures a Spark job, reads records from a text file, converts them into an HBase-friendly format, and then bulk loads the result, covering the key steps of writing temporary HFiles and running the LoadIncrementalHFiles tool.
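For reference, the job below expects a comma-separated text file at /tmp/test_stu.txt with three fields per line (key,id,name); the sample rows here are made up purely for illustration:

1,1001,zhangsan
2,1002,lisi

The target table (stu2 with column family info in the code below) must already exist in HBase before the job runs, because both the region locator and the bulk load operate on an existing table.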


Below is the content of submit.sh. (Note a few points: the jar list after --jars must stay on a single line, the application jar must be on its own line, and each line continuation must be written as a space followed by a backslash right before the newline.)

#!/bin/bash

CDH_BIN='/data/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/bin'
LIB_PATH='/data/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib'

$CDH_BIN/spark-submit \
 --class com.t.t.Simple \
 --jars \
 $LIB_PATH/hbase/hbase-hadoop-compat.jar,$LIB_PATH/hbase/hbase-hadoop2-compat.jar,$LIB_PATH/hbase/hbase-zookeeper-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-thrift-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-testing-util-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-spark-it-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-spark-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-shell-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-shaded-protobuf-2.1.0.jar,$LIB_PATH/hbase/hbase-shaded-netty-2.1.0.jar,$LIB_PATH/hbase/hbase-shaded-miscellaneous-2.1.0.jar,$LIB_PATH/hbase/hbase-rsgroup-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-rest-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-resource-bundle-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-replication-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-protocol-shaded-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-protocol-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-procedure-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-metrics-api-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-it-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-http-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-hadoop2-compat-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-external-blockcache-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-endpoint-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-metrics-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-server-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-mapreduce-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-common-2.1.0-cdh6.2.0.jar,$LIB_PATH/hbase/hbase-client-2.1.0-cdh6.2.0.jar \
 /tmp/SparkOnLinux-1.0-SNAPSHOT.jar
 

A few things to note about the code below:

In a KeyValue, both the rowkey and the cell value must be String. If the original field is an int or float, convert it to a String first and then call Bytes.toBytes(String).

Otherwise, after the data is loaded, the int and float values will be unreadable in HBase, e.g. \x00\x00\x00\x00.
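As a minimal sketch of this point (the variable id is only an illustration): Bytes.toBytes(int) stores the raw 4-byte encoding, whereas converting to a String first keeps the cell readable in the hbase shell.

int id = 1001;
byte[] binary   = Bytes.toBytes(id);                  // 4 raw bytes, displayed as \x00\x00\x03\xE9 in the hbase shell
byte[] readable = Bytes.toBytes(String.valueOf(id));  // the text "1001", readable in the hbase shell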

 

package com.t.t;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.client.HRegionLocator;

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.MetricsIOWrapper;
import org.apache.hadoop.hbase.mapred.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;


public class Simple {

        public static void main(String[] args) throws IOException {

            SparkConf conf = new SparkConf().setAppName("Spark app : submit on linux");
            JavaSparkContext jsc = new JavaSparkContext(conf);
            SparkSession spark = SparkSession.builder()
                    .config(conf)
                    .enableHiveSupport()
                    .getOrCreate();
            //1. read text file
            String path="/tmp/test_stu.txt";
            JavaRDD<String> ds=spark.read().textFile(path).javaRDD();

            JavaRDD<Row> javaRDD = ds.map(new Function<String, Row>() {
                @Override
                public Row call(String s) throws Exception {
                    String [] parts=s.split(",");
                    System.out.println(Arrays.toString(parts));
                    return RowFactory.create(parts[0],parts[1],parts[2]);
                }
            });

            // define hbase config
            Configuration hbaseConf = HBaseConfiguration.create();

            final String table_name="stu2";
            final String columnFamily="info";
            final String fields="key,id,name";
            hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, table_name);
            Job job = Job.getInstance();
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(KeyValue.class);
            job.setOutputFormatClass(HFileOutputFormat2.class);


            Connection conn = ConnectionFactory.createConnection(hbaseConf);
            TableName tableName = TableName.valueOf(table_name);
            RegionLocator regionLocator = conn.getRegionLocator(tableName);
            //HRegionLocator regionLocator = new HRegionLocator(tableName, (ClusterConnection) conn);
            Table realTable = conn.getTable(tableName);

            HFileOutputFormat2.configureIncrementalLoad(job, realTable, regionLocator);

            //3. mapToPair
            final int rowKeyIndex=0;  // use the first column as the rowkey
            JavaPairRDD<ImmutableBytesWritable, KeyValue> javaPairRDD =
                    javaRDD.mapToPair(new PairFunction<Row, ImmutableBytesWritable, List<Tuple2<ImmutableBytesWritable, KeyValue>>>() {
                        @Override
                        public Tuple2<ImmutableBytesWritable, List<Tuple2<ImmutableBytesWritable, KeyValue>>> call(Row row) throws Exception
                        {
                            // list1 holds one Tuple2<rowkey, KeyValue> per non-rowkey column; it is returned below
                            List<Tuple2<ImmutableBytesWritable, KeyValue>> list1 = new ArrayList<>();

                            // keep the rowkey as a String so the stored bytes stay readable (see the note above)
                            // and so the sort key matches the row bytes written into each KeyValue
                            String rowkey = row.getString(rowKeyIndex);
                            ImmutableBytesWritable writable = new ImmutableBytesWritable(Bytes.toBytes(rowkey));

                            // list2 holds (column index, field name) pairs
                            // the column qualifiers must be written in sorted order, otherwise the HFile writer reports an error
                            ArrayList<Tuple2<Integer, String>> list2 = new ArrayList<>();
                            String[] columns = fields.split(",");
                            for (int i = 0; i < columns.length; i++) {
                                list2.add(new Tuple2<Integer, String>(i, columns[i]));
                            }
                            // iterate over the field names and match each one against the Row's columns
                            for (Tuple2<Integer, String> t : list2) {
                                //String[] fieldNames = row.schema().fieldNames();
                                String[] fieldNames=fields.split(",");
                                // do not store the rowkey field as a regular column as well
                                if (t._2().equals(fieldNames[rowKeyIndex])) {
                                    System.out.println(String.format("%s == %s continue", t._2(), fieldNames[rowKeyIndex]));
                                    continue;
                                }
                                // t._1 is the column index in the tuple, t._2 is the column name
                                //String value = (String) row.getAs(t._1);
                                KeyValue kv = new KeyValue(Bytes.toBytes(rowkey),
                                        Bytes.toBytes(columnFamily),
                                        Bytes.toBytes(t._2()), Bytes.toBytes((String) row.getAs(t._1)));
                                list1.add(new Tuple2<>(writable, kv));
                            }
                            return new Tuple2<>(writable, list1);
                        }
                        // the records must be sorted by rowkey here; sortByKey is expensive, but no better alternative has been found so far
                    }).sortByKey().flatMapToPair(new PairFlatMapFunction<Tuple2<ImmutableBytesWritable, List<Tuple2<ImmutableBytesWritable, KeyValue>>>,
                            ImmutableBytesWritable, KeyValue>() {
                        @Override
                        public Iterator<Tuple2<ImmutableBytesWritable, KeyValue>> call(Tuple2<ImmutableBytesWritable,
                                List<Tuple2<ImmutableBytesWritable, KeyValue>>> tuple2s) throws Exception {

                            return tuple2s._2().iterator();
                        }
                    });


            // temporary HDFS directory where the HFiles are written
            String temp = "/tmp/testBulkLoad/" + table_name + "_" + System.currentTimeMillis();
            javaPairRDD.saveAsNewAPIHadoopFile(temp, ImmutableBytesWritable.class,
                    KeyValue.class, HFileOutputFormat2.class, job.getConfiguration());

            // load the generated HFiles into the table
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(hbaseConf);
            Admin admin = conn.getAdmin();
            loader.doBulkLoad(new Path(temp), admin, realTable, regionLocator);

            // release HBase and Spark resources
            admin.close();
            realTable.close();
            conn.close();
            jsc.stop();

        }

}

The inserted data can be deleted and updated successfully from the hbase shell.
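As a small verification sketch (it reuses the imports from the class above; the rowkey "1" is just an assumed sample value), the loaded data can also be read back through the regular client API:

try (Connection verifyConn = ConnectionFactory.createConnection(HBaseConfiguration.create());
     Table stu = verifyConn.getTable(TableName.valueOf("stu2"))) {
    Result r = stu.get(new Get(Bytes.toBytes("1")));   // "1" is an assumed sample rowkey
    String name = Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("name")));
    System.out.println("info:name = " + name);
}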
