I. Environment
- OS: Windows 10
- IDE: IntelliJ IDEA (Scala plugin)
- Build tool: Maven 3.6.0
- JDK 1.8
- Scala 2.11.11
- Spark 2.1.1
- MySQL 5.7
- c3p0 0.9.1.2 (MySQL connection pool)
II. Case
- Read words from a socket source, count them, and write the counts to a MySQL database.
- Use c3p0 to maintain a database connection pool; every MySQL connection is borrowed from the pool, cutting the cost of establishing connections.
- Wrap the c3p0 pool in a lazily initialized singleton, so the pool itself is created at most once.
- MySQL stores the count of each input word.
III. Steps
1. Database
create database rdd;
use rdd;
CREATE TABLE IF NOT EXISTS wordfreq (
    id   BIGINT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    word VARCHAR(20) NOT NULL COMMENT 'word',
    cnt  INT NOT NULL COMMENT 'word count',
    -- CURRENT_TIMESTAMP rather than a zero date: MySQL 5.7's default strict mode rejects '0000-00-00 00:00:00'
    time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'time of the count'
);
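Before moving on, the connection details can be sanity-checked with a plain JDBC probe. This is a hypothetical one-off (the object name SchemaCheck is not part of the project) and assumes the root/root credentials used in step 3:

import java.sql.DriverManager

// Hypothetical one-off check: verifies the table and credentials
// before wiring up the pool and the streaming job.
object SchemaCheck {
  def main(args: Array[String]): Unit = {
    Class.forName("com.mysql.jdbc.Driver") // same driver the pool class uses in step 3
    val conn = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/rdd?useUnicode=true&characterEncoding=UTF-8",
      "root", "root")
    try {
      val rs = conn.createStatement().executeQuery("show columns from wordfreq")
      while (rs.next()) println(rs.getString(1)) // expect: id, word, cnt, time
    } finally {
      conn.close()
    }
  }
}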
2. pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.etc</groupId>
    <artifactId>LearnSparkStreaming</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.1</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>
        <dependency><!-- MySQL connection pool -->
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                        <configuration>
                            <includes>
                                <include>**/*.scala</include>
                            </includes>
                        </configuration>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
3. c3p0 utility class
package com.util

import com.mchange.v2.c3p0.ComboPooledDataSource
import java.sql.Connection

class MysqlPool extends Serializable {
  // true = auto-register the data source with c3p0
  private val cpds: ComboPooledDataSource = new ComboPooledDataSource(true)
  try {
    // MySQL connection settings
    cpds.setJdbcUrl("jdbc:mysql://localhost:3306/rdd?useUnicode=true&characterEncoding=UTF-8")
    cpds.setDriverClass("com.mysql.jdbc.Driver")
    cpds.setUser("root")
    cpds.setPassword("root")
    cpds.setMaxPoolSize(10)     // maximum number of pooled connections
    cpds.setMinPoolSize(2)      // minimum number of pooled connections
    cpds.setAcquireIncrement(2) // connections acquired at a time when the pool grows
    cpds.setMaxStatements(180)  // size of c3p0's global PreparedStatement cache
  } catch {
    case e: Exception => e.printStackTrace()
  }

  // Borrow a connection from the pool
  def getConnection: Connection = {
    try {
      cpds.getConnection()
    } catch {
      case e: Exception =>
        e.printStackTrace()
        null
    }
  }
}

// Lazy singleton: the pool is only initialized when a task actually needs it
object MysqlManager {
  @volatile private var mysqlPool: MysqlPool = _

  def getMysqlPool: MysqlPool = {
    if (mysqlPool == null) {
      synchronized {
        if (mysqlPool == null) {
          mysqlPool = new MysqlPool
        }
      }
    }
    mysqlPool
  }
}
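A throwaway main can exercise the pool outside of Spark. The object name PoolSmokeTest is hypothetical, and it assumes the wordfreq table from step 1 exists:

package com.util

// Hypothetical smoke test: the singleton hands back the same pool every time,
// and close() on a pooled connection only returns it to the pool.
object PoolSmokeTest {
  def main(args: Array[String]): Unit = {
    val samePool = MysqlManager.getMysqlPool eq MysqlManager.getMysqlPool
    println("same pool instance: " + samePool) // expected: true
    val conn = MysqlManager.getMysqlPool.getConnection
    if (conn != null) {
      try {
        val rs = conn.createStatement().executeQuery("select count(*) from wordfreq")
        if (rs.next()) println("rows in wordfreq: " + rs.getInt(1))
      } finally {
        conn.close()
      }
    }
  }
}

Note that close() on a c3p0 pooled connection returns it to the pool rather than tearing it down, which is what makes borrowing a connection per partition cheap.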
4. Word count class
package com.wisdom

import com.util.MysqlManager
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object WordCountToMysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream("localhost", 9999)
    // Split each line into words
    val words = lines.flatMap(_.split(" "))
    //import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3
    // Count each word in each batch
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)
    // After the reduce, write the results to MySQL (an output operation)
    wordCounts.foreachRDD((rdd: RDD[(String, Int)], time: Time) => {
      // Skip empty RDDs; otherwise every partition would still acquire a
      // database connection for nothing
      if (!rdd.isEmpty()) {
        // Each partition executes one batch of SQL statements
        rdd.foreachPartition((partition: Iterator[(String, Int)]) => {
          // partition and record live on the local worker node, so conn and
          // statement never need to be serialized and shipped
          // Get a MySQL connection from the pool
          val conn = MysqlManager.getMysqlPool.getConnection
          if (conn == null) {
            // Guard against null: a newly added node without database access rights
            // would otherwise hang here, retrying the connection forever
            println("conn is null.") // printed in the executor on the worker node
          } else {
            println("conn is not null.")
            // Create a statement
            val statement = conn.createStatement()
            try {
              conn.setAutoCommit(false) // commit manually
              partition.foreach((record: (String, Int)) => {
                // Build the SQL and add it to the batch
                // Note: string values such as the word column must be single-quoted;
                // a word containing a quote breaks this SQL (see the PreparedStatement
                // sketch after this class)
                val sql = "insert into wordfreq (word, cnt, time) values ('" + record._1 + "', " + record._2 + ", sysdate());"
                statement.addBatch(sql) // add the statement to the batch
              })
              statement.executeBatch() // execute the whole batch
              conn.commit() // commit the transaction
            } catch {
              case e: Exception => e.printStackTrace()
            } finally {
              statement.close() // close the statement
              conn.close() // return the connection to the pool
            }
          }
        })
      }
    })
    // Print the first elements of each RDD generated by the DStream to the console
    wordCounts.print() // print() is an output operation; it shows the first 10 records by default
    ssc.start() // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}
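As noted in the comments, the concatenated SQL above breaks on words containing a single quote. A sketch of the same per-partition write using a PreparedStatement batch; the object and method names (WordCountToMysqlPrepared, writePartitioned) are hypothetical, and writePartitioned(rdd) would replace the rdd.foreachPartition block inside foreachRDD:

package com.wisdom

import com.util.MysqlManager
import org.apache.spark.rdd.RDD

object WordCountToMysqlPrepared {
  // Hypothetical drop-in replacement for the per-partition write above.
  def writePartitioned(rdd: RDD[(String, Int)]): Unit = {
    rdd.foreachPartition { partition =>
      val conn = MysqlManager.getMysqlPool.getConnection
      if (conn != null) {
        // Placeholders handle quoting, so words containing ' no longer break the SQL
        val ps = conn.prepareStatement(
          "insert into wordfreq (word, cnt, time) values (?, ?, sysdate())")
        try {
          conn.setAutoCommit(false)
          partition.foreach { case (word, cnt) =>
            ps.setString(1, word)
            ps.setInt(2, cnt)
            ps.addBatch()
          }
          ps.executeBatch()
          conn.commit()
        } catch {
          case e: Exception => e.printStackTrace()
        } finally {
          ps.close()
          conn.close()
        }
      }
    }
  }
}

With Connector/J, appending rewriteBatchedStatements=true to the JDBC URL additionally lets the driver rewrite the batch into a single multi-row insert.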
IV. Run
- Start netcat (on Windows, e.g. ncat from the Nmap package) and type words:
nc -lp 9999
- Run the main method of WordCountToMysql.
- The word counts are written to MySQL in real time, once per 5-second batch.