Motivation
A broadcast variable lets the driver send a lookup dataset to each executor once and cache it there, so tasks do not repeatedly ship the same data over the network with every closure; this cuts network traffic as well as the CPU cost of serializing and deserializing that data for each task. Spark's shared variables come in two kinds: accumulators, which tasks may only add to (mutable), and broadcast variables, which are read-only (immutable). A short sketch of both kinds follows.
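A minimal sketch of the two kinds of shared variables, assuming a local SparkContext (the object name SharedVariablesSketch and the variables lookup and missing are illustrative only, not part of the case study below):

import org.apache.spark.{SparkConf, SparkContext}

object SharedVariablesSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("SharedVariablesSketch")
    val sc = new SparkContext(conf)

    // Broadcast variable: read-only, cached once per executor.
    val lookup = sc.broadcast(Map(1 -> "a", 2 -> "b"))
    // Accumulator: tasks can only add to it; the driver reads the total.
    val missing = sc.longAccumulator("missing keys")

    val mapped = sc.parallelize(Seq(1, 2, 3)).map { k =>
      lookup.value.getOrElse(k, { missing.add(1); "unknown" })
    }
    println(mapped.collect().mkString(","))  // a,b,unknown
    println(missing.value)                   // 1 (key 3 has no entry in the lookup)

    lookup.unpersist()
    sc.stop()
  }
}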
Example
Enrich user records with city details by city ID.
import org.apache.spark.{SparkConf, SparkContext}

object UserCityBrocast {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("UserCityBrocast")
    val sc = new SparkContext(conf)

    // Lookup tables built on the driver: city code -> city name,
    // and telephone -> (user ID, user name).
    val cityDetailMap = Map(
      "010" -> "北京",
      "021" -> "上海",
      "020" -> "广州",
      "0755" -> "深圳")
    val userDetailMap = Map(
      "15837312345" -> ("userID_001", "Alice"),
      "15837322331" -> ("userID_002", "Bob"),
      "13637316666" -> ("userID_003", "Thomas"),
      "18537312399" -> ("userID_004", "Karen"),
      "13637312376" -> ("userID_005", "Tom"),
      "13737312908" -> ("userID_006", "Kotlin"))

    // Broadcast both lookup tables so each executor caches a single read-only copy.
    val cdmBroadcast = sc.broadcast(cityDetailMap)
    val udmBroadcast = sc.broadcast(userDetailMap)

    // Fact data: (city code, telephone) pairs.
    val userArray = Array(
      ("010", "15837322331"),
      ("010", "18537312399"),
      ("0755", "13737312908"),
      ("020", "13637312376"),
      ("020", "15837312345"))
    val userRDD = sc.parallelize(userArray, 2)

    // Group the telephones by city code.
    val aggregateRDD = userRDD.aggregateByKey(collection.mutable.Set[String]())(
      (telephoneSet, telephone) => telephoneSet += telephone,
      (telephoneSet1, telephoneSet2) => telephoneSet1 ++= telephoneSet2)

    // Enrich each group using the broadcast lookup tables.
    val resultRDD = aggregateRDD.map(info => {
      val cityInfo = CityInfo(info._1, cdmBroadcast.value(info._1))
      val userInfoSet = collection.mutable.Set[UserInfo]()
      for (telephone <- info._2) {
        val idAndName = udmBroadcast.value(telephone)
        val userInfo = UserInfo(idAndName._1, telephone, idAndName._2)
        userInfoSet.add(userInfo)
      }
      (cityInfo, userInfoSet)
    })
    println(resultRDD.collect.mkString(","))

    // Release the broadcast variables once they are no longer needed.
    cdmBroadcast.unpersist()
    udmBroadcast.unpersist()

    sc.stop()
  }
}

case class CityInfo(cityCode: String, cityName: String)

case class UserInfo(userID: String, telephone: String, userName: String)
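Running this locally prints one (CityInfo, Set[UserInfo]) tuple per city code; since the telephones are gathered into an unordered Set and the partitions are processed independently, the order of tuples and of set elements may vary between runs. Note that unpersist() only removes the cached copies from the executors, so the broadcast would be re-sent if it were referenced again, whereas destroy() releases the broadcast variable permanently.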