1、准备文件
wget http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/spam.data
2、加载文件
scala> val inFile = sc.textFile("/tmp/spam")
3、显示一行
scala> inFile.first()
4、函数运用
(1)map
val nums = inFile.map(x=>x.split(' ').map(_.toDouble))
nums.first()
(2)collect
val rdd = sc.parallelize(List(1,2,3,4,5))
val mapRdd = rdd.map(2*_)
mapRdd.collect
(3)filter
val filterRdd = sc.parallelize(List(1,2,3,4,5)).map(_*2).filter(_>5)
filterRdd.collect
(4)flatMap
val rdd = sc.textFile("/home/scipio/README.md")
rdd.count
rdd.cache
rdd.count
val wordCount = rdd.flatMap(_.split(' ')).map(x=>(x,1)).reduceByKey(_+_)
wordCount.collect
wordCount.saveAsTextFile("/home/scipio/wordCountResult.txt")
(5)union
val rdd = sc.parallelize(List(('a',1),('a',2)))
val rdd2 = sc.parallelize(List(('b',1),('b',2)))
rdd union rdd2
res3.collect
(6) join
val rdd1 = sc.parallelize(List(('a',1),('a',2),('b',3),('b',4)))
val rdd2 = sc.parallelize(List(('a',5),('a',6),('b',7),('b',8)))
rdd1 join rdd2
res1.collect
(7)lookup
val rdd1 = sc.parallelize(List(('a',1),('a',2),('b',3),('b',4)))
rdd1.lookup('a')
(8)groupByKey
val wc = sc.textFile("/home/scipio/README.md").flatMap(_.split(' ')).map((_,1)).groupByKey
wc.collect
(9)sortByKey
val rdd = sc.textFile("/home/scipio/README.md")
val wordcount = rdd.flatMap(_.split(' ')).map((_,1)).reduceByKey(_+_)
val wcsort = wordcount.map(x => (x._2,x._1)).sortByKey(false).map(x => (x._2,x._1))
wcsort.saveAsTextFile("/home/scipio/sort.txt")
wget http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/spam.data
2、加载文件
scala> val inFile = sc.textFile("/tmp/spam")
3、显示一行
scala> inFile.first()
4、函数运用
(1)map
val nums = inFile.map(x=>x.split(' ').map(_.toDouble))
nums.first()
(2)collect
val rdd = sc.parallelize(List(1,2,3,4,5))
val mapRdd = rdd.map(2*_)
mapRdd.collect
(3)filter
val filterRdd = sc.parallelize(List(1,2,3,4,5)).map(_*2).filter(_>5)
filterRdd.collect
(4)flatMap
val rdd = sc.textFile("/home/scipio/README.md")
rdd.count
rdd.cache
rdd.count
val wordCount = rdd.flatMap(_.split(' ')).map(x=>(x,1)).reduceByKey(_+_)
wordCount.collect
wordCount.saveAsTextFile("/home/scipio/wordCountResult.txt")
(5)union
val rdd = sc.parallelize(List(('a',1),('a',2)))
val rdd2 = sc.parallelize(List(('b',1),('b',2)))
rdd union rdd2
res3.collect
(6) join
val rdd1 = sc.parallelize(List(('a',1),('a',2),('b',3),('b',4)))
val rdd2 = sc.parallelize(List(('a',5),('a',6),('b',7),('b',8)))
rdd1 join rdd2
res1.collect
(7)lookup
val rdd1 = sc.parallelize(List(('a',1),('a',2),('b',3),('b',4)))
rdd1.lookup('a')
(8)groupByKey
val wc = sc.textFile("/home/scipio/README.md").flatMap(_.split(' ')).map((_,1)).groupByKey
wc.collect
(9)sortByKey
val rdd = sc.textFile("/home/scipio/README.md")
val wordcount = rdd.flatMap(_.split(' ')).map((_,1)).reduceByKey(_+_)
val wcsort = wordcount.map(x => (x._2,x._1)).sortByKey(false).map(x => (x._2,x._1))
wcsort.saveAsTextFile("/home/scipio/sort.txt")