Operators: distinct, union, intersection, subtract, cartesian, mapToPair, flatMapToPair
distinct union intersection subtract cartesian
distinct drops duplicate elements, union concatenates two RDDs (keeping duplicates), intersection keeps the elements common to both (deduplicated), subtract removes the elements that also appear in the other RDD, and cartesian produces every (left, right) pair.
Java version
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class rddJava1 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("rddJava1").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<String> strings = Arrays.asList("aa", "bb", "cc", "aa", "cc");
        JavaRDD<String> strRdd = sc.parallelize(strings);

        // distinct: drop duplicate elements
        JavaRDD<String> distinctRdd = strRdd.distinct();
        List<String> collect = distinctRdd.collect();
        for (String s : collect) {
            System.out.println(s);
        }

        // union: concatenate two RDDs, keeping duplicates
        List<String> strings2 = new ArrayList<>();
        strings2.add("aa");
        strings2.add("bb");
        strings2.add("ee");
        JavaRDD<String> strRdd2 = sc.parallelize(strings2);
        JavaRDD<String> unionRdd = strRdd2.union(strRdd);
        List<String> unionCollect = unionRdd.collect();
        for (String s : unionCollect) {
            System.out.println(s);
        }

        // intersection: elements present in both RDDs, deduplicated
        JavaRDD<String> RDD1 = sc.parallelize(Arrays.asList("aa", "aa", "bb", "cc", "dd"));
        JavaRDD<String> RDD2 = sc.parallelize(Arrays.asList("aa", "dd", "ff"));
        JavaRDD<String> intersectionRDD = RDD1.intersection(RDD2);
        List<String> intersectionCollect = intersectionRDD.collect();
        for (String str : intersectionCollect) {
            System.out.print(str + " ");
        }

        // subtract: elements of strRdd that do not appear in strRdd2
        JavaRDD<String> subtractRdd = strRdd.subtract(strRdd2);
        List<String> collect1 = subtractRdd.collect();
        for (String s : collect1) {
            System.out.println(s);
        }

        // cartesian: every (left, right) pair across the two RDDs
        JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("1", "2", "3", "a"));
        JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("1", "3", "b"));
        JavaPairRDD<String, String> cartesianRdd = rdd1.cartesian(rdd2);
        List<Tuple2<String, String>> collect2 = cartesianRdd.collect();
        for (Tuple2<String, String> tp2 : collect2) {
            System.out.println(tp2);
        }
    }
}
Scala version
RDD1.cartesian(RDD2) returns the Cartesian product of RDD1 and RDD2. Only cartesian is written out here, because the other four behave like familiar collection methods; a quick shell sketch of them follows the example below.
val RDD1 = sc.parallelize(List("1","2","3"))
val RDD2 = sc.parallelize(List("a","b","c"))
val cartesianRDD = RDD1.cartesian(RDD2)
cartesianRDD.collect
//res: Array[(String, String)] = Array((1,a), (1,b), (1,c), (2,a), (2,b), (2,c), (3,a), (3,b), (3,c))
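For reference, the other four operators checked the same way in the shell, on small assumed inputs; element order in the collected results is not guaranteed, so the res comments are illustrative:
val a = sc.parallelize(List("aa", "bb", "cc", "aa"))
val b = sc.parallelize(List("aa", "dd"))
a.distinct.collect
//res: Array[String] = Array(aa, bb, cc)
a.union(b).collect
//res: Array[String] = Array(aa, bb, cc, aa, aa, dd)
a.intersection(b).collect
//res: Array[String] = Array(aa)
a.subtract(b).collect
//res: Array[String] = Array(bb, cc)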
mapToPair flatMapToPair
mapToPair produces exactly one key-value pair per input element; flatMapToPair can produce zero or more pairs per element.
Java:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class MapToPairJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("MapToPairJava").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("in/sample.txt");

        // mapToPair: one (key, 1) pair per line, keyed by the first word
        JavaPairRDD<String, Integer> mapToPairRdd = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                String[] split = s.split(" ");
                String key = split[0];
                return new Tuple2<>(key, 1);
            }
        });

        // flatMapToPair: one (word, 1) pair for every word on the line
        JavaPairRDD<String, Integer> stringIntegerJavaPairRDD = lines.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
                List<Tuple2<String, Integer>> list = new ArrayList<>();
                String[] split = s.split(" ");
                for (int i = 0; i < split.length; i++) {
                    String key = split[i];
                    Tuple2<String, Integer> tp2 = new Tuple2<>(key, 1);
                    list.add(tp2);
                }
                return list.iterator();
            }
        });

        List<Tuple2<String, Integer>> collect = stringIntegerJavaPairRDD.collect();
        for (Tuple2<String, Integer> tuple2 : collect) {
            System.out.println("key:" + tuple2._1 + " value:" + tuple2._2);
        }
    }
}
Scala:
import org.apache.spark.{SparkConf, SparkContext}

object MapToPairScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("MapToPairScala")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("in/sample.txt")
    // Scala has no separate mapToPair: mapping to a tuple already yields a pair RDD
    val pairs = lines.map(x => (x.split(" ")(0), 1))
    pairs.collect.foreach(println)
  }
}
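The Scala object above covers only the mapToPair half. A minimal sketch of the flatMapToPair counterpart, assuming the same in/sample.txt input (the object name FlatMapToPairScala is made up for illustration):

import org.apache.spark.{SparkConf, SparkContext}

// illustrative sketch: plain flatMap to tuples plays the role of flatMapToPair
object FlatMapToPairScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FlatMapToPairScala")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("in/sample.txt")
    // one (word, 1) pair for every word on every line, like the Java flatMapToPair above
    val wordPairs = lines.flatMap(_.split(" ")).map(word => (word, 1))
    wordPairs.collect.foreach(println)
  }
}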