MySQL 8.0.17 (the default MySQL on Ubuntu 19.10)
Spark 2.1.0
To run this, add the relevant dependencies in IDEA (look up the matching versions in Maven).
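For reference, the pom.xml entries might look like the following. The Spark artifact and version match Spark 2.1.0 (which is built against Scala 2.11 by default); the Connector/J version here is an assumption chosen to match the 8.0.17 server above, so adjust both to your environment:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<!-- MySQL JDBC driver; version assumed to match the server -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.17</version>
</dependency>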
package org.apache.spark.examples.streaming

import java.sql.{Connection, DriverManager}

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Word count with Spark Streaming, writing each batch's results to MySQL
object ForeachRDDApp {

  def main(args: Array[String]): Unit = {
    // Entry point configuration
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("ForeachRDDApp")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    // If you use a stateful operator, you must set a checkpoint directory;
    // in production, point it at a directory on HDFS
    ssc.checkpoint(".")
    ssc.sparkContext.setLogLevel("ERROR")

    // Input stream (DStream) from a socket
    val lines = ssc.socketTextStream("localhost", 6789)
    val result = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

    // Print each batch's counts to the console
    result.print()

    // Write each RDD to MySQL, one connection per partition
    result.foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        val connection = createConnection()
        val statement = connection.createStatement()
        partitionOfRecords.foreach(record => {
          // String-concatenated SQL is fragile; the second version below uses PreparedStatement
          val sql = "insert into wordcount(word, wordcount) values('" + record._1 + "'," + record._2 + ")"
          statement.execute(sql)
        })
        statement.close()
        connection.close()
      })
    })

    // Start the StreamingContext, receive data, then process it
    ssc.start()
    ssc.awaitTermination()
  }

  def createConnection(): Connection = {
    // com.mysql.jdbc.Driver is the legacy Connector/J 5.x class name;
    // Connector/J 8.x (matching MySQL 8.0.17) uses com.mysql.cj.jdbc.Driver
    Class.forName("com.mysql.cj.jdbc.Driver")
    DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "root", "hadoop")
  }
}
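Two notes on this version. First, it only counts words within each 5-second batch; the comments mention stateful operators, but none is actually used, so the checkpoint is not strictly required here. A minimal sketch of a truly stateful count, replacing the reduceByKey line above with updateStateByKey (this is what makes ssc.checkpoint(...) mandatory):

// Accumulate counts across batches instead of per batch
val state = lines.flatMap(_.split(" ")).map((_, 1))
  .updateStateByKey[Int]((newValues: Seq[Int], running: Option[Int]) =>
    Some(newValues.sum + running.getOrElse(0)))

Second, unlike the version below, this program assumes the wordcount table already exists; a schema such as create table wordcount(word varchar(20), wordcount int) would do (column names are taken from the insert statement, the column types are assumptions). To feed either program, start a socket server first, e.g. nc -lk 6789, then type space-separated words into it.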
The version below instead accumulates counts for the same word via SQL statements, updating the running total stored in the table (shown as a second standalone file, reusing the object name ForeachRDDApp):
package org.apache.spark.examples.streaming

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ForeachRDDApp {

  // Set the log level to WARN
  Logger.getLogger("org").setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    // Local run settings
    val conf: SparkConf = new SparkConf()
      .setMaster("local[3]") // at least two cores: one to receive, one to process
      .setAppName(this.getClass.getSimpleName)

    // StreamingContext with a 4-second batch interval
    val ssc = new StreamingContext(conf, Seconds(4))

    // Receive socket data and create a DStream
    val text: ReceiverInputDStream[String] = ssc.socketTextStream("localhost", 6789)

    // Process each batch RDD
    text.foreachRDD(rdd => {
      val result: RDD[(String, Int)] = rdd.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
      result.foreach(println)

      // Write to the database, creating one connection per partition
      result.foreachPartition(p => {
        val conn: Connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark?characterEncoding=utf-8", "root", "hadoop")
        val statement: PreparedStatement = conn.prepareStatement("create table if not exists wordcount3 (word varchar(20), total int)")
        statement.executeUpdate()

        p.foreach(t => {
          // If the word already exists in the database, update its total
          // (count in the current batch + count already in the database)
          val statement1: PreparedStatement = conn.prepareStatement("select * from wordcount3 where word=?")
          statement1.setString(1, t._1)
          val set: ResultSet = statement1.executeQuery()
          if (set.next()) {
            val count: Int = set.getInt("total")
            val newcount: Int = count + t._2
            // Update the running total
            val statement2: PreparedStatement = conn.prepareStatement("update wordcount3 set total=? where word=?")
            statement2.setInt(1, newcount)
            statement2.setString(2, t._1)
            statement2.executeUpdate()
          } else {
            // First time this word appears: insert it
            val statement3: PreparedStatement = conn.prepareStatement("insert into wordcount3 values(?,?)")
            statement3.setString(1, t._1)
            statement3.setInt(2, t._2)
            statement3.executeUpdate()
          }
        })
        conn.close()
      })
    })

    // Start the application
    ssc.start()
    // Block, waiting for the program to be terminated
    ssc.awaitTermination()
  }
}
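The select-then-update/insert round trips above can be collapsed into a single MySQL upsert. This is a sketch under one extra assumption: wordcount3 must have a primary or unique key on word (the create table statement above does not declare one), e.g. via alter table wordcount3 add primary key (word):

// One statement per word: insert, or add to the running total on key collision
p.foreach(t => {
  val upsert: PreparedStatement = conn.prepareStatement(
    "insert into wordcount3(word, total) values(?, ?) " +
      "on duplicate key update total = total + values(total)")
  upsert.setString(1, t._1)
  upsert.setInt(2, t._2)
  upsert.executeUpdate()
  upsert.close()
})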
This second version is adapted from another blogger's post:
https://blog.csdn.net/weixin_40155674/article/details/80708052