I. Main Flow Diagram
Environment: Flink 1.12.0, Hive 2.3.4, Kafka 2.11, Hadoop 2.7.2, Scala 2.11, Maxwell, HBase 1.2
II. Implementation Steps
1. Real-time synchronization of MySQL dimension-table data to HBase
1.1 Enable MySQL binlog: edit /etc/my.cnf and add the following four configuration lines:
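A typical row-format binlog configuration that Maxwell can read looks roughly like this (the server-id value and the wm database name are illustrative):
[mysqld]
server-id=1
log-bin=mysql-bin
binlog_format=row
binlog-do-db=wm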
Then restart the MySQL server (service mysql restart). Check the MySQL data directory (/var/lib/mysql by default); if binlog files such as mysql-bin.000001 are present, binlog generation is working.
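The same can be verified from the MySQL client:
show variables like 'log_bin';
show master status;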
1.2 Configure Maxwell to monitor the binlog, parse the data-change events, and send them to Kafka
Edit the Maxwell configuration file: vi config.properties
host=192.168.25.128
user=maxwell
password=123456
client_id=maxwell-1
producer=kafka
kafka.bootstrap.servers=ELK01:9092,ELK02:9092,ELK03:9092
kafka_topic=wm_db
# the same client_id is used when bootstrapping existing DB data
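Maxwell also needs a MySQL account matching the user/password above; the grants below follow the Maxwell quick start (a sketch, adjust the host and password to the actual environment), after which Maxwell is started with the configuration file:
CREATE USER 'maxwell'@'%' IDENTIFIED BY '123456';
GRANT ALL ON maxwell.* TO 'maxwell'@'%';
GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO 'maxwell'@'%';
bin/maxwell --config config.properties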
1.3 Write a Flink program that dispatches each table's data to its own Kafka topic
The payload Maxwell sends to Kafka looks like this:
{"database":"wm","table":"tm_key_users","type":"update","ts":1626876186,"xid":19715,"commit":true,"data":{"user_id":"1","user_name":"4","province_code":"1","is_key_user":1,"remark":"1","create_time":"2021-07-21 14:03:06","update_time":"0000-00-00 00:00:00"},"old":{"user_name":"2","create_time":"2021-07-21 13:59:49"}}
Records are routed to different Kafka topics according to the table field in the payload: create an anonymous subclass of KeyedSerializationSchemaWrapper and override its getTargetTopic method.
val kafkaProducerSin = new FlinkKafkaProducer[String](
  "", // default topic; getTargetTopic below always supplies the real one
  new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()) {
    override def getTargetTopic(element: String): String = {
      val jsonObject: JSONObject = JSON.parseObject(element)
      val tableName = jsonObject.getString("table")
      // returning null would fall back to the default topic; here every record goes to ods_<table>
      "ods_" + tableName
    }
  },
  p,
  FlinkKafkaProducer.Semantic.AT_LEAST_ONCE)
// sink to kafka
filterStream.addSink(kafkaProducerSin)
The full program is as follows:
package com.tang.crawler.wb

import java.util.Properties

import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.flink.api.common.functions.FilterFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, FlinkKafkaProducer}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object DbTableToKafkaSin {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val tableEnv = StreamTableEnvironment.create(env)
    env.enableCheckpointing(1000, CheckpointingMode.EXACTLY_ONCE)

    // kafka consumer properties
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "ELK01:9092")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")

    // 1. source: consume the maxwell change stream from the wm_db topic as plain strings
    val kafkaSource = new FlinkKafkaConsumer[String]("wm_db", new SimpleStringSchema(), properties)
    val inputStream = env.addSource(kafkaSource)
    inputStream.print()

    // tables in the db that need to be monitored
    val list = List[String]("tm_key_users", "tm_province")

    // keep only the change records of the tables in the list
    val filterStream = inputStream.filter(new FilterFunction[String]() {
      override def filter(value: String): Boolean = {
        val jsonObject: JSONObject = JSON.parseObject(value)
        if (null == jsonObject) {
          return false
        }
        val tableName = jsonObject.getString("table")
        if (null == tableName) {
          return false
        }
        list.contains(tableName)
      }
    })

    // kafka producer properties
    val p = new Properties()
    p.setProperty("bootstrap.servers", "ELK01:9092")

    val kafkaProducerSin = new FlinkKafkaProducer[String](
      "", // default topic; getTargetTopic below always supplies the real one
      new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()) {
        override def getTargetTopic(element: String): String = {
          val jsonObject: JSONObject = JSON.parseObject(element)
          val tableName = jsonObject.getString("table")
          // returning null would fall back to the default topic; here every record goes to ods_<table>
          "ods_" + tableName
        }
      },
      p,
      FlinkKafkaProducer.Semantic.AT_LEAST_ONCE)

    // sink to kafka
    filterStream.addSink(kafkaProducerSin)

    env.execute("db table dispatch")
  }
}
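The producer writes to topics named ods_<table>; if automatic topic creation is disabled on the brokers, the target topics can be created up front, for example (partition and replication counts are illustrative; older Kafka releases take --zookeeper ELK01:2181 instead of --bootstrap-server):
kafka-topics.sh --create --bootstrap-server ELK01:9092 --topic ods_tm_key_users --partitions 3 --replication-factor 2
kafka-topics.sh --create --bootstrap-server ELK01:9092 --topic ods_tm_province --partitions 3 --replication-factor 2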
1.4 Create Flink Kafka tables in maxwell-json format to turn the Maxwell records in Kafka into table data (Maxwell captures inserts, updates, and deletes; in this DB deletes are logical, i.e. a flag column is updated, so no delete-type messages are produced)
Create the Flink Kafka table for key users, consuming the corresponding Kafka topic:
-- key users table
drop table if exists kafka_ods_tm_key_users;
CREATE TABLE kafka_ods_tm_key_users (
  user_id STRING COMMENT 'user id',
  user_name STRING COMMENT 'user name',
  province_code STRING COMMENT 'province code',
  is_key_user INT COMMENT 'key user flag: 0 - no, 1 - yes',
  remark STRING COMMENT 'remark',
  create_time STRING,
  update_time STRING
) WITH (
  'connector' = 'kafka',
  'topic' = 'ods_tm_key_users',
  'properties.bootstrap.servers' = 'ELK01:9092',
  'properties.group.id' = 'ods_group',
  'format' = 'maxwell-json'
);
-- province table
drop table if exists kafka_ods_tm_province;
CREATE TABLE kafka_ods_tm_province (
  id INT COMMENT 'primary key id',
  code STRING COMMENT 'province code',
  name STRING COMMENT 'province name',
  code_level INT COMMENT 'level',
  remark STRING COMMENT 'remark'
) WITH (
  'connector' = 'kafka',
  'topic' = 'ods_tm_province',
  'properties.bootstrap.servers' = 'ELK01:9092',
  'properties.group.id' = 'ods_group',
  'format' = 'maxwell-json'
);
1.5 Test whether MySQL data changes are synchronized to the Flink Kafka tables in real time
Initial state of the Flink Kafka table:
Insert a row in MySQL with user_id 9 and user_name 张三.
Then update the row with user_id 9, changing user_name to 李四; the change appears in the Flink Kafka table.
Deletes are normally logical deletes, i.e. they are really update operations (a flag column is changed), so all insert/update/delete changes in the DB are synchronized to the Flink Kafka tables in real time.
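For reference, the state of the table can be watched from the Flink SQL client with a simple continuous query (assuming the kafka connector and maxwell-json format jars are on the SQL client classpath):
select * from kafka_ods_tm_key_users;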
1.6 Write the data in the Flink Kafka tables into Flink HBase tables (the HBase tables hold the full data set, while the Kafka tables contain only change records)
Create the table in HBase and the corresponding Flink HBase table in Flink:
-- create the table in HBase (HBase shell)
create 'wm:hbase_tm_key_users', 'info'
-- Flink HBase table for key users
drop table if exists hbase_tm_key_users;
CREATE TABLE hbase_tm_key_users (
  rowkey STRING,
  info ROW<user_name STRING, province_code STRING, is_key_user INT, remark STRING>,
  PRIMARY KEY (rowkey) NOT ENFORCED
) WITH (
  'connector' = 'hbase-1.4',
  'table-name' = 'wm:hbase_tm_key_users',
  'zookeeper.quorum' = 'ELK01:2181,ELK02:2181,ELK03:2181',
  'zookeeper.znode.parent' = '/hbase'
);
-- read the Kafka table data and write it into HBase; the HBase table then holds the full data set, kept in real-time sync with the MySQL table
INSERT INTO hbase_tm_key_users
SELECT user_id, ROW(user_name, province_code, is_key_user, remark) AS info FROM kafka_ods_tm_key_users;
-- create the province table in HBase (HBase shell)
create 'wm:hbase_tm_province', 'info'
-- Flink HBase table for provinces
drop table if exists hbase_tm_province;
CREATE TABLE hbase_tm_province (
  rowkey STRING,
  info ROW<name STRING, code_level INT>,
  PRIMARY KEY (rowkey) NOT ENFORCED
) WITH (
  'connector' = 'hbase-1.4',
  'table-name' = 'wm:hbase_tm_province',
  'zookeeper.quorum' = 'ELK01:2181,ELK02:2181,ELK03:2181',
  'zookeeper.znode.parent' = '/hbase'
);
-- read the Kafka table data and write it into HBase; the HBase table then holds the full data set, kept in real-time sync with the MySQL table
INSERT INTO hbase_tm_province
SELECT code, ROW(name, code_level) AS info FROM kafka_ods_tm_province;
Test
Initial data in the Flink HBase table:
In MySQL, update the row with code 2004 to set its name to 广东省, and insert a new row with code 2005 and name 北京市; then check the data in the Flink HBase table.
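The result can be checked either from the HBase shell or through the Flink HBase table, for example (the second statement runs a bounded scan in the Flink SQL client):
scan 'wm:hbase_tm_province'
select * from hbase_tm_province;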
2. Join the user-info stream table with the key-user and province dimension tables
3. Join the public-opinion info stream with the user-info stream and write the results into Hive (ES)
4. Test and verify
Details to be added.
The overall idea: Maxwell captures the changes of the public-opinion table and the user-info table in MySQL; the user table is first joined with the key-user and province dimension tables to produce a user-info dimension table, which is then joined in real time with the public-opinion stream; the result finally lands in ES to serve the various metric queries.
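As a rough illustration of the dimension-table join described above, a Flink SQL lookup join against the HBase tables could look like the sketch below; the user stream table kafka_ods_tm_users and its proc_time (PROCTIME()) attribute are assumptions and are not defined earlier in this article:
-- enrich the user stream with the key-user and province dimensions
SELECT u.*, k.info AS key_user_info, p.info AS province_info
FROM kafka_ods_tm_users AS u
LEFT JOIN hbase_tm_key_users FOR SYSTEM_TIME AS OF u.proc_time AS k
  ON u.user_id = k.rowkey
LEFT JOIN hbase_tm_province FOR SYSTEM_TIME AS OF u.proc_time AS p
  ON u.province_code = p.rowkey;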