Link 🔗: GitHub - josonle/MapReduce-Demo: hands-on practice examples for Hadoop MapReduce programming
Assignment:
The task is the same as in the CSDN blog post "MapReduce编程实践(Hadoop3.2.2)": write the Java programs, package them into .jar files, and run them (a compile-and-package sketch is appended to the command list below).
Common commands:
1. Start Hadoop
cd /usr/local/hadoop
./sbin/start-dfs.sh
2. Remove the existing input/output directories on HDFS
./bin/hdfs dfs -rm -r input
./bin/hdfs dfs -rm -r output
3. Create a new input directory
./bin/hdfs dfs -mkdir input
4. Upload the data file
./bin/hdfs dfs -put ./date.txt input
5. Run the .jar
./bin/hadoop jar ./myapp/DateDistinct.jar input output
6. View the output
./bin/hdfs dfs -cat output/*
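7. Compile and package (as referenced above). Step 5 assumes the jar already exists under ./myapp; the commands below are only a minimal sketch, assuming the source file (DateDistinct.java here, as an example) sits in the Hadoop directory and a ./classes staging directory is acceptable; an IDE export works just as well.
cd /usr/local/hadoop
mkdir -p ./classes ./myapp
# compile against the Hadoop classpath
javac -classpath "$(./bin/hadoop classpath)" -d ./classes DateDistinct.java
# package the compiled classes into the jar used in step 5
jar -cvf ./myapp/DateDistinct.jar -C ./classes .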
3. Open README.md from the link above for the documentation, then read, analyze, and run the following code (parts of the code contain errors and must be fixed yourself).
a) Generate the data by running rand.sh
First, make the script executable:
chmod +x rand.sh
Then run it to generate the data, saved as date.txt (a rough sketch of such a script follows the command):
./rand.sh
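The actual rand.sh lives in the linked repository; the sketch below is only a stand-in (the date range, record count, output file name, and reliance on GNU date are assumptions, not the repository's script):
#!/bin/bash
# hypothetical generator: 1000 random dates in 2023, one per line, written to date.txt
for i in $(seq 1 1000); do
  offset=$((RANDOM % 365))
  date -d "2023-01-01 +${offset} days" "+%Y-%m-%d"
done > date.txt
The mappers below only read the first whitespace-separated field of each line, so one date per line is enough for both DateDistinct and DateCount.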

b) Data deduplication: DateDistinct.java. The job keeps only the distinct dates, so each date appears exactly once in the output (the format shown in the documentation is wrong).
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DateDistinct {
public static class DateDistinctMapper extends Mapper<Object, Text, Text, NullWritable> {
public void map(Object key, Text value, Context context )
throws IOException, InterruptedException {
String[] strs = value.toString().split(" ");
Text date = new Text(strs[0]);
context.write(date, NullWritable.get());
}
}
public static class DateDistinctReducer extends Reducer<Text,NullWritable,Text,NullWritable> {
public void reduce(Text key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "DateDistinct");
job.setJarByClass(DateDistinct.class);
job.setMapperClass(DateDistinct.DateDistinctMapper.class);
job.setCombinerClass(DateDistinct.DateDistinctReducer.class);
job.setReducerClass(DateDistinct.DateDistinctReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}

c) Date counting: DateCount.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DateCount {
public static class DateCountMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
public void map(Object key, Text value, Context context )
throws IOException, InterruptedException {
String[] strs = value.toString().split(" "); //split the input line on spaces
Text date = new Text(strs[0]); //take the date field
context.write(date, one); //emit (date, 1) as the map output
}
}
public static class DateCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "DateCount");
job.setJarByClass(DateCount.class);
job.setMapperClass(DateCount.DateCountMapper.class);
job.setCombinerClass(DateCount.DateCountReducer.class);
job.setReducerClass(DateCount.DateCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}

d) Sort data in ascending order: DateSortAsc.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DateSortAsc {
public DateSortAsc() {
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
// Create the job
Job job = Job.getInstance(conf, "Data SortAsc");
// Configure the job:
job.setJarByClass(DateSortAsc.class);
// Set the map and reduce logic
job.setMapperClass(SortMapper.class);
job.setReducerClass(SortReducer.class);
// Set the output key/value types
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
// Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
// Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
// Mapper logic
public static class SortMapper extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable num = new IntWritable();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String[] strs = value.toString().split(":");
if (strs.length >= 2) {
num.set(Integer.parseInt(strs[1]));
// Use the count as the key so the shuffle sorts it in ascending order
context.write(num, new Text(strs[0]));
} else {
// Handle malformed input records
System.err.println("Invalid input format: " + value.toString());
}
}
}
public static class SortReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
public void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values)
context.write(value, key);
}
}
}

e) Sort data in descending order: DateSortDesc.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DateSortDesc {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
// Create the job
Job job = Job.getInstance(conf, "Data SortDesc");
// Configure the job:
job.setJarByClass(DateSortDesc.class);
// Set the sort comparator so the keys are ordered in descending order
job.setSortComparatorClass(MyComparator.class);
// Set the map and reduce logic
job.setMapperClass(SortMapper.class);
job.setReducerClass(SortReducer.class);
// Set the output key/value types
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
// Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
// Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static class MyComparator extends WritableComparator {
public MyComparator() {
super(IntWritable.class, true);
}
@Override
@SuppressWarnings({ "rawtypes", "unchecked" })
// suppress raw/unchecked type warnings
public int compare(WritableComparable a, WritableComparable b) {
// compareTo: by default the framework uses a.compareTo(b), which returns -1 when a < b, giving ascending order
// Returning b.compareTo(a) instead flips the sign, so the keys are sorted in descending order
return b.compareTo(a);
}
}
// Mapper logic
public static class SortMapper extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable num = new IntWritable();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String[] strs = value.toString().split(":");
if (strs.length >= 2) {
num.set(Integer.parseInt(strs[1]));
// Use the count as the key; MyComparator sorts it in descending order
context.write(num, new Text(strs[0]));
} else {
// Handle malformed input records
System.err.println("Invalid input format: " + value.toString());
}
}
}
public static class SortReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
public void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values) {
context.write(value, key);
}
}
}
}

f) Custom object serialization: FlowStatistics.java (mobile traffic); the data format follows the attached log.txt
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class FlowStatistics {
public FlowStatistics(){
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
//Create the job
Job job = Job.getInstance(conf, "Flow Statistic");
//Configure the job:
job.setJarByClass(FlowStatistics.class);
//Set the map and reduce logic
job.setMapperClass(FlowStatistics.SortMapper.class);
job.setReducerClass(FlowStatistics.SortReducer.class);
//Set the output key/value types
job.setOutputKeyClass(MySortKey.class);
job.setOutputValueClass(Text.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
//Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
//Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
public static class MySortKey implements WritableComparable<MySortKey> {
private int upFlow;
private int downFlow;
private int sumFlow;
public void FlowSort(int up, int down) {
upFlow = up;
downFlow = down;
sumFlow = up + down;
}
public void setUpFlow(int upFlow) {
this.upFlow=upFlow;
}
public void setDownFlow(int downFlow) {
this.downFlow=downFlow;
}
public void setSumFlow(int sumFlow) {
this.sumFlow=sumFlow;
}
//Serialization: write the fields to the output stream
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(upFlow);
out.writeInt(downFlow);
out.writeInt(sumFlow);
}
//Deserialization: read the fields back in the same order
@Override
public void readFields(DataInput in) throws IOException {
upFlow = in.readInt();
downFlow = in.readInt();
sumFlow = in.readInt();
}
//Override the key ordering used during the shuffle
@Override
public int compareTo(MySortKey o)
{
if ((this.upFlow - o.upFlow) == 0) {// equal upFlow: compare downFlow instead
return o.downFlow - this.downFlow;// sort by downFlow in descending order
} else
return this.upFlow - o.upFlow;// sort by upFlow in ascending order
}
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow;
}
}
public static class SortMapper extends Mapper<Object, Text, MySortKey, Text> {
Text phone = new Text();
MySortKey mySortKey = new MySortKey();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String[] lists = value.toString().split("\t");
phone.set(lists[0]);
int up = Integer.parseInt(lists[1]);
int down = Integer.parseInt(lists[2]);
mySortKey.setUpFlow(up);
mySortKey.setDownFlow(down);
mySortKey.setSumFlow(up + down);// sumFlow must be set explicitly, otherwise it stays 0 in the output
context.write(mySortKey, phone);// swap phone number and flow: the flow object becomes the sort key
}
}
public static class SortReducer extends Reducer<MySortKey, Text, Text, MySortKey> {
public void reduce(MySortKey key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
for (Text value : values) {
System.out.println(value.toString()+","+key.toString());
context.write(value, key);// swap back: the phone number becomes the output key and the flow object the value
}
}
}
}

4. A.csv and B.csv are two tables; the first column is a name and the second column is an age. Use MapReduce to compute and output the union, intersection, and difference of the two tables.
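The union and difference drivers below accept one or more input paths followed by an output path, while the Intersection driver takes exactly one input path and one output path (there the two CSV files can simply share a single input directory). A usage sketch for the union job, following the command style above (the jar name and paths are assumptions):
./bin/hdfs dfs -put ./A.csv ./B.csv input
./bin/hdfs dfs -rm -r output
./bin/hadoop jar ./myapp/union.jar input/A.csv input/B.csv output
./bin/hdfs dfs -cat output/*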
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class union {
public union() {
}
//
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
//Create the job
Job job = Job.getInstance(conf, "Union");
//Configure the job:
job.setJarByClass(union.class);
job.setMapperClass(union.UnionMapper.class);
job.setReducerClass(union.UnionReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
//Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
//Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
//Mapper<Object, Text, Text, IntWritable>: the types of k1, v1, k2, v2
//Hadoop's own writable types are used so the records can be serialized
//Mapper logic
public static class UnionMapper extends Mapper<Object, Text, Text, IntWritable> {
private static final IntWritable grade = new IntWritable();
private Text word = new Text();//wraps a String
public UnionMapper() {
}
public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
String[] info = value.toString().split(",");
word.set(info[0]);
grade.set(Integer.parseInt(info[1]));
context.write(word, grade);
}
}
//The Text and IntWritable classes come from the org.apache.hadoop.io package
public static class UnionReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public UnionReducer() {
}
public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
//For the union, emit each distinct key once, keeping the first age value seen
for(IntWritable value:values) {
result.set(value.get());
break;
}
context.write(key, this.result);
}
}
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Intersection {
public Intersection() {
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "ABcsv");
job.setJarByClass(Intersection.class);
job.setMapperClass(DateDistinctMapper.class);
/*
* With the map/reduce output types set below, this combiner must stay commented out,
* otherwise the job fails: used as a combiner, DateDistinctReducer would emit NullWritable
* values while the reduce phase expects IntWritable values.
*/
//job.setCombinerClass(DateDistinctReducer.class);
job.setReducerClass(DateDistinctReducer.class);
//set output class
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
public static class DateDistinctMapper extends Mapper<Object, Text, Text, IntWritable> {
public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
/*
* union
* String line = value.toString();
Text text = new Text(line);
context.write(text, NullWritable.get());
*/
String line = value.toString();
Text text = new Text(line);
context.write(text,new IntWritable(1));
}
}
public static class DateDistinctReducer extends Reducer<Text,IntWritable, Text,NullWritable > {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
/*union
* context.write(key, NullWritable.get());
*/
int sum = 0;
for(IntWritable val : values) {
sum += val.get();
}
//sum > 1 means the same line occurs in more than one input file, i.e. in both A and B
if(sum >1) {
context.write(key,NullWritable.get());
}
}
}
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class differences {
public differences() {
}
//
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
//Create the job
Job job = Job.getInstance(conf, "Difference");
//Configure the job:
job.setJarByClass(differences.class);
job.setMapperClass(differences.DiffMapper.class);
job.setReducerClass(differences.DiffReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
//Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
//Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
//Mapper<Object, Text, Text, IntWritable>: the types of k1, v1, k2, v2
//Hadoop's own writable types are used so the records can be serialized
//Mapper logic
public static class DiffMapper extends Mapper<Object, Text, Text, IntWritable> {
private static final IntWritable grade = new IntWritable();
private Text word = new Text();//wraps a String
public DiffMapper() {
}
public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
String[] info = value.toString().split(",");
word.set(info[0]);
grade.set(Integer.parseInt(info[1]));
context.write(word, grade);
}
}
//The Text and IntWritable classes come from the org.apache.hadoop.io package
public static class DiffReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public DiffReducer() {
}
public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int count=0;
int val = 0;
for(IntWritable value:values) {
count++;
val= value.get();
}
//a key seen exactly once occurs in only one of the two input tables (symmetric difference)
if(count == 1) {
result.set(val);
context.write(key, this.result);
}
}
}
}
5. Titanic_data is the Titanic dataset, containing 12 columns in total.
In the Survived column, 0 means the passenger died and 1 means the passenger survived; the Sex column gives the gender. Use MapReduce to compute the average age of the men and of the women who died in the disaster.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Titanic {
public Titanic() {
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
//Create the job
Job job = Job.getInstance(conf, "Titanic Avg");
//Configure the job:
job.setJarByClass(Titanic.class);
job.setMapperClass(Titanic.MyMapper.class);
job.setReducerClass(Titanic.AgeSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
//Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
//Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
//Mapper logic
public static class MyMapper extends Mapper<Object, Text, Text, DoubleWritable> {
private static Text men = new Text("男");
private static Text women = new Text("女");
private static DoubleWritable age = new DoubleWritable();
public MyMapper() {
}
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
//Guard against an empty trailing line
String str = value.toString();
if(!str.equals("")) {
String[] strs = value.toString().split(",");
int flag = Integer.parseInt(strs[1]); //strs[1] is the Survived column: 0 = died, 1 = survived
if(flag == 0 && (!strs[4].equals("")) && (!strs[5].equals("")) ) { //Sex (strs[4]) and Age (strs[5]) must be non-empty
String sex = strs[4];
age.set(Double.valueOf(strs[5]));
if(sex.equals("male")) {
context.write(men,age);
}else {
context.write(women,age);
}
}
}
}
}
public static class AgeSumReducer extends Reducer<Text, DoubleWritable, Text,DoubleWritable > {
private static DoubleWritable average = new DoubleWritable();
public AgeSumReducer() {
}
public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
double sum = 0; //use double so ages are not truncated while summing
for(DoubleWritable value:values) {
count += 1;
sum += value.get();
}
average.set(sum/count);
context.write(key,average);
}
}
}
Lab 2: MapReduce Programming Practice
2. Compute the average score of each student in the grade file
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class GradeAverage {
public GradeAverage() {
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "grade average");
job.setJarByClass(GradeAverage.class);
job.setMapperClass(GradeAverage.TokenizerMapper.class);
//Do not reuse the averaging reducer as a combiner: averaging per-split averages gives wrong results
job.setReducerClass(GradeAverage.IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static class TokenizerMapper extends Mapper<Object, Text, Text, DoubleWritable> {
private DoubleWritable score = new DoubleWritable();
private Text word = new Text();
public TokenizerMapper() {
}
public void map(Object key, Text value, Mapper<Object, Text, Text, DoubleWritable>.Context context) throws IOException, InterruptedException {
String[] tokens = value.toString().split("\t"); // tab-separated fields
if (tokens.length == 2) {
this.word.set(tokens[0]); // student ID
double scoreValue = Double.parseDouble(tokens[1]); // score
score.set(scoreValue);
context.write(this.word, score);
}
}
}
public static class IntSumReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
private DoubleWritable result = new DoubleWritable();
public IntSumReducer() {
}
public void reduce(Text key, Iterable<DoubleWritable> values, Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context) throws IOException, InterruptedException {
double sum = 0;
int count = 0;
for (DoubleWritable val : values) {
sum += val.get();
count++;
}
double average = sum / count;
result.set(average);
context.write(key, result);
}
}
}
3. The score file holds randomly generated scores for a number of students in Chinese, math, and English. Sort the records by total score in descending order; when totals are equal, sort by the Chinese score in ascending order.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;
public class ScoreSort {
public static class ScoreWritable implements WritableComparable<ScoreWritable> {
private int chineseScore;
private int mathScore;
private int englishScore;
public ScoreWritable() {
}
public void set(int chineseScore, int mathScore, int englishScore) {
this.chineseScore = chineseScore;
this.mathScore = mathScore;
this.englishScore = englishScore;
}
@Override
public int compareTo(ScoreWritable other) {
int totalScoreComparison = Integer.compare(other.getTotalScore(), this.getTotalScore());
if (totalScoreComparison != 0) {
return totalScoreComparison;
}
return Integer.compare(this.chineseScore, other.chineseScore);
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(chineseScore);
out.writeInt(mathScore);
out.writeInt(englishScore);
}
@Override
public void readFields(DataInput in) throws IOException {
chineseScore = in.readInt();
mathScore = in.readInt();
englishScore = in.readInt();
}
public int getTotalScore() {
return chineseScore + mathScore + englishScore;
}
public int getChineseScore() {
return chineseScore;
}
public int getMathScore() {
return mathScore;
}
public int getEnglishScore() {
return englishScore;
}
@Override
public String toString() {
return chineseScore + "\t" + mathScore + "\t" + englishScore;
}
}
public static class TokenizerMapper extends Mapper<Object, Text, ScoreWritable, Text> {
private ScoreWritable scoreWritable = new ScoreWritable();
private Text name = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
if (itr.hasMoreTokens()) {
// Assuming the first token is the student name
name.set(itr.nextToken());
if (itr.hasMoreTokens()) {
// Assuming the next three tokens are the scores for Chinese, Math, and English
int chineseScore = Integer.parseInt(itr.nextToken());
int mathScore = Integer.parseInt(itr.nextToken());
int englishScore = Integer.parseInt(itr.nextToken());
// Set the scores in the ScoreWritable object
scoreWritable.set(chineseScore, mathScore, englishScore);
// Emit the key-value pair with ScoreWritable as key and student name as value
context.write(scoreWritable, name);
}
}
}
}
public static class IntSumReducer extends Reducer<ScoreWritable, Text, Text, Text> {
private Text result = new Text();
public void reduce(ScoreWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text val : values) {
// Emit the student name, total score, and individual subject scores
result.set(key.toString() + "\t" + key.getTotalScore());
context.write(val, result);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: studentscoresort <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "student score sort");
job.setJarByClass(ScoreSort.class);
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(ScoreWritable.class);
job.setOutputValueClass(Text.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}