Link 🔗: GitHub - josonle/MapReduce-Demo: hands-on practice examples for Hadoop MapReduce programming
Assignment:
The task is the same as in the CSDN blog post "MapReduce编程实践(Hadoop3.2.2)": write the Java programs, package them into .jar files, and run them (a compile-and-package sketch is appended to the command list below).
Common commands:
1. Start Hadoop
cd /usr/local/hadoop
./sbin/start-dfs.sh
2. Remove the existing input/output directories on HDFS
./bin/hdfs dfs -rm -r input
./bin/hdfs dfs -rm -r output
3. Create a new input directory
./bin/hdfs dfs -mkdir input
4. Upload the data file
./bin/hdfs dfs -put ./date.txt input
5. Run the .jar
./bin/hadoop jar ./myapp/DateDistinct.jar input output
6. View the output
./bin/hdfs dfs -cat output/*
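7. Compile and package (as referenced above). Step 5 assumes the jar already exists under ./myapp; the commands below are only a minimal sketch, assuming the source file (DateDistinct.java here, as an example) sits in the Hadoop directory and a ./classes staging directory is acceptable; an IDE export works just as well.
cd /usr/local/hadoop
mkdir -p ./classes ./myapp
# compile against the Hadoop classpath
javac -classpath "$(./bin/hadoop classpath)" -d ./classes DateDistinct.java
# package the compiled classes into the jar used in step 5
jar -cvf ./myapp/DateDistinct.jar -C ./classes .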
3. Open README.md from the link above for the documentation, then read, analyze, and run the following code (parts of the code contain errors and must be fixed yourself).
a) Generate the data by running rand.sh
First, make the script executable:
chmod +x rand.sh
Then run it to generate the data, saved as date.txt (a rough sketch of such a script follows the command):
./rand.sh
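The actual rand.sh lives in the linked repository; the sketch below is only a stand-in (the date range, record count, output file name, and reliance on GNU date are assumptions, not the repository's script):
#!/bin/bash
# hypothetical generator: 1000 random dates in 2023, one per line, written to date.txt
for i in $(seq 1 1000); do
  offset=$((RANDOM % 365))
  date -d "2023-01-01 +${offset} days" "+%Y-%m-%d"
done > date.txt
The mappers below only read the first whitespace-separated field of each line, so one date per line is enough for both DateDistinct and DateCount.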

b) Data deduplication: DateDistinct.java. The job keeps only the distinct dates, so each date appears exactly once in the output (the format shown in the documentation is wrong).
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DateDistinct {
public static class DateDistinctMapper extends Mapper<Object, Text, Text, NullWritable> {
public void map(Object key, Text value, Context context )
throws IOException, InterruptedException {
String[] strs = value.toString().split(" ");
Text date = new Text(strs[0]);
context.write(date, NullWritable.get());
}
}
public static class DateDistinctReducer extends Reducer<Text,NullWritable,Text,NullWritable> {
public void reduce(Text key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "DateDistinct");
job.setJarByClass(DateDistinct.class);
job.setMapperClass(DateDistinct.DateDistinctMapper.class);
job.setCombinerClass(DateDistinct.DateDistinctReducer.class);
job.setReducerClass(DateDistinct.DateDistinctReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}

c) Date counting: DateCount.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DateCount {
public static class DateCountMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
public void map(Object key, Text value, Context context )
throws IOException, InterruptedException {
String[] strs = value.toString().split(" "); //split the input line on spaces
Text date = new Text(strs[0]); //take the date field
context.write(date, one); //emit (date, 1) as the map output
}
}
public static class DateCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "DateCount");
job.setJarByClass(DateCount.class);
job.setMapperClass(DateCount.DateCountMapper.class);
job.setCombinerClass(DateCount.DateCountReducer.class);
job.setReducerClass(DateCount.DateCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}

d) Sort data in ascending order: DateSortAsc.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DateSortAsc {
public DateSortAsc() {
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
// Create the job
Job job = Job.getInstance(conf, "Data SortAsc");
// Configure the job:
job.setJarByClass(DateSortAsc.class);
// Set the map and reduce logic
job.setMapperClass(SortMapper.class);
job.setReducerClass(SortReducer.class);
// Set the output key/value types
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
// Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
// Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
// Mapper logic
public static class SortMapper extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable num = new IntWritable();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String[] strs = value.toString().split(":");
if (strs.length >= 2) {
num.set(Integer.parseInt(strs[1]));
// Use the count as the key so the shuffle sorts it in ascending order
context.write(num, new Text(strs[0]));
} else {
// Handle malformed input records
System.err.println("Invalid input format: " + value.toString());
}
}
}
public static class SortReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
public void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values)
context.write(value, key);
}
}
}

e) Sort data in descending order: DateSortDesc.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DateSortDesc {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
// Create the job
Job job = Job.getInstance(conf, "Data SortDesc");
// Configure the job:
job.setJarByClass(DateSortDesc.class);
// Set the sort comparator so the keys are ordered in descending order
job.setSortComparatorClass(MyComparator.class);
// Set the map and reduce logic
job.setMapperClass(SortMapper.class);
job.setReducerClass(SortReducer.class);
// Set the output key/value types
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
// Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
// Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static class MyComparator extends WritableComparator {
public MyComparator() {
super(IntWritable.class, true);
}
@Override
@SuppressWarnings({ "rawtypes", "unchecked" })
// suppress raw/unchecked type warnings
public int compare(WritableComparable a, WritableComparable b) {
// compareTo: by default the framework uses a.compareTo(b), which returns -1 when a < b, giving ascending order
// Returning b.compareTo(a) instead flips the sign, so the keys are sorted in descending order
return b.compareTo(a);
}
}
// Mapper logic
public static class SortMapper extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable num = new IntWritable();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String[] strs = value.toString().split(":");
if (strs.length >= 2) {
num.set(Integer.parseInt(strs[1]));
// Use the count as the key; MyComparator sorts it in descending order
context.write(num, new Text(strs[0]));
} else {
// Handle malformed input records
System.err.println("Invalid input format: " + value.toString());
}
}
}
public static class SortReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
public void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values) {
context.write(value, key);
}
}
}
}

f) Custom object serialization: FlowStatistics.java (mobile traffic); the data format follows the attached log.txt
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class FlowStatistics {
public FlowStatistics(){
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
//Create the job
Job job = Job.getInstance(conf, "Flow Statistic");
//Configure the job:
job.setJarByClass(FlowStatistics.class);
//Set the map and reduce logic
job.setMapperClass(FlowStatistics.SortMapper.class);
job.setReducerClass(FlowStatistics.SortReducer.class);
//Set the output key/value types
job.setOutputKeyClass(MySortKey.class);
job.setOutputValueClass(Text.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
//Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
//Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
public static class MySortKey implements WritableComparable<MySortKey> {
private int upFlow;
private int downFlow;
private int sumFlow;
public void FlowSort(int up, int down) {
upFlow = up;
downFlow = down;
sumFlow = up + down;
}
public void setUpFlow(int upFlow) {
this.upFlow=upFlow;
}
public void setDownFlow(int downFlow) {
this.downFlow=downFlow;
}
public void setSumFlow(int sumFlow) {
this.sumFlow=sumFlow;
}
//Serialization: write the fields to the output stream
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(upFlow);
out.writeInt(downFlow);
out.writeInt(sumFlow);
}
//Deserialization: read the fields back in the same order
@Override
public void readFields(DataInput in) throws IOException {
upFlow = in.readInt();
downFlow = in.readInt();
sumFlow = in.readInt();
}
//Override the key ordering used during the shuffle
@Override
public int compareTo(MySortKey o)
{
if ((this.upFlow - o.upFlow) == 0) {// equal upFlow: compare downFlow instead
return o.downFlow - this.downFlow;// sort by downFlow in descending order
} else
return this.upFlow - o.upFlow;// sort by upFlow in ascending order
}
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow;
}
}
public static class SortMapper extends Mapper<Object, Text, MySortKey, Text> {
Text phone = new Text();
MySortKey mySortKey = new MySortKey();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String[] lists = value.toString().split("\t");
phone.set(lists[0]);
int up = Integer.parseInt(lists[1]);
int down = Integer.parseInt(lists[2]);
mySortKey.setUpFlow(up);
mySortKey.setDownFlow(down);
mySortKey.setSumFlow(up + down);// sumFlow must be set explicitly, otherwise it stays 0 in the output
context.write(mySortKey, phone);// swap phone number and flow: the flow object becomes the sort key
}
}
public static class SortReducer extends Reducer<MySortKey, Text, Text, MySortKey> {
public void reduce(MySortKey key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
for (Text value : values) {
System.out.println(value.toString()+","+key.toString());
context.write(value, key);// swap back: the phone number becomes the output key and the flow object the value
}
}
}
}

4. A.csv and B.csv are two tables; the first column is a name and the second column is an age. Use MapReduce to compute and output the union, intersection, and difference of the two tables.
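The union and difference drivers below accept one or more input paths followed by an output path, while the Intersection driver takes exactly one input path and one output path (there the two CSV files can simply share a single input directory). A usage sketch for the union job, following the command style above (the jar name and paths are assumptions):
./bin/hdfs dfs -put ./A.csv ./B.csv input
./bin/hdfs dfs -rm -r output
./bin/hadoop jar ./myapp/union.jar input/A.csv input/B.csv output
./bin/hdfs dfs -cat output/*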
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class union {
public union() {
}
//
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
//Create the job
Job job = Job.getInstance(conf, "Union");
//Configure the job:
job.setJarByClass(union.class);
job.setMapperClass(union.UnionMapper.class);
job.setReducerClass(union.UnionReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
//Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
//Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
//Mapper<Object, Text, Text, IntWritable>: the types of k1, v1, k2, v2
//Hadoop's own writable types are used so the records can be serialized
//Mapper logic
public static class UnionMapper extends Mapper<Object, Text, Text, IntWritable> {
private static final IntWritable grade = new IntWritable();
private Text word = new Text();//wraps a String
public UnionMapper() {
}
public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
String[] info = value.toString().split(",");
word.set(info[0]);
grade.set(Integer.parseInt(info[1]));
context.write(word, grade);
}
}
//The Text and IntWritable classes come from the org.apache.hadoop.io package
public static class UnionReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public UnionReducer() {
}
public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
//For the union, emit each distinct key once, keeping the first age value seen
for(IntWritable value:values) {
result.set(value.get());
break;
}
context.write(key, this.result);
}
}
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Intersection {
public Intersection() {
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "ABcsv");
job.setJarByClass(Intersection.class);
job.setMapperClass(DateDistinctMapper.class);
/*
* With the map/reduce output types set below, this combiner must stay commented out,
* otherwise the job fails: used as a combiner, DateDistinctReducer would emit NullWritable
* values while the reduce phase expects IntWritable values.
*/
//job.setCombinerClass(DateDistinctReducer.class);
job.setReducerClass(DateDistinctReducer.class);
//set output class
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
public static class DateDistinctMapper extends Mapper<Object, Text, Text, IntWritable> {
public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
/*
* union
* String line = value.toString();
Text text = new Text(line);
context.write(text, NullWritable.get());
*/
String line = value.toString();
Text text = new Text(line);
context.write(text,new IntWritable(1));
}
}
public static class DateDistinctReducer extends Reducer<Text,IntWritable, Text,NullWritable > {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
/*union
* context.write(key, NullWritable.get());
*/
int sum = 0;
for(IntWritable val : values) {
sum += val.get();
}
//sum > 1 means the same line occurs in more than one input file, i.e. in both A and B
if(sum >1) {
context.write(key,NullWritable.get());
}
}
}
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class differences {
public differences() {
}
//
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
//Create the job
Job job = Job.getInstance(conf, "Difference");
//Configure the job:
job.setJarByClass(differences.class);
job.setMapperClass(differences.DiffMapper.class);
job.setReducerClass(differences.DiffReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
//Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
//Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
//Mapper<Object, Text, Text, IntWritable>: the types of k1, v1, k2, v2
//Hadoop's own writable types are used so the records can be serialized
//Mapper logic
public static class DiffMapper extends Mapper<Object, Text, Text, IntWritable> {
private static final IntWritable grade = new IntWritable();
private Text word = new Text();//wraps a String
public DiffMapper() {
}
public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
String[] info = value.toString().split(",");
word.set(info[0]);
grade.set(Integer.parseInt(info[1]));
context.write(word, grade);
}
}
//The Text and IntWritable classes come from the org.apache.hadoop.io package
public static class DiffReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public DiffReducer() {
}
public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int count=0;
int val = 0;
for(IntWritable value:values) {
count++;
val= value.get();
}
//a key seen exactly once occurs in only one of the two input tables (symmetric difference)
if(count == 1) {
result.set(val);
context.write(key, this.result);
}
}
}
}
5. Titanic_data is the Titanic dataset, containing 12 columns in total.
In the Survived column, 0 means the passenger died and 1 means the passenger survived; the Sex column gives the gender. Use MapReduce to compute the average age of the men and of the women who died in the disaster.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Titanic {
public Titanic() {
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if(otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
//Create the job
Job job = Job.getInstance(conf, "Titanic Avg");
//Configure the job:
job.setJarByClass(Titanic.class);
job.setMapperClass(Titanic.MyMapper.class);
job.setReducerClass(Titanic.AgeSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
for(int i = 0; i < otherArgs.length - 1; ++i) {
//Add the input paths
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
//Output: written to the output directory by a FileOutputFormat subclass
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true)?0:1);
}
//Mapper logic
public static class MyMapper extends Mapper<Object, Text, Text, DoubleWritable> {
private static Text men = new Text("男");
private static Text women = new Text("女");
private static DoubleWritable age = new DoubleWritable();
public MyMapper() {
}
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
//Guard against an empty trailing line
String str = value.toString();
if(!str.equals("")) {
String[] strs = value.toString().split(",");
int flag = Integer.parseInt(strs[1]); //strs[1] is the Survived column: 0 = died, 1 = survived
if(flag == 0 && (!strs[4].equals("")) && (!strs[5].equals("")) ) { //Sex (strs[4]) and Age (strs[5]) must be non-empty
String sex = strs[4];
age.set(Double.valueOf(strs[5]));
if(sex.equals("male")) {
context.write(men,age);
}else {
context.write(women,age);
}
}
}
}
}
public static class AgeSumReducer extends Reducer<Text, DoubleWritable, Text,DoubleWritable > {
private static DoubleWritable average = new DoubleWritable();
public AgeSumReducer() {
}
public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
double sum = 0; //use double so ages are not truncated while summing
for(DoubleWritable value:values) {
count += 1;
sum += value.get();
}
average.set(sum/count);
context.write(key,average);
}
}
}
Lab 2: MapReduce Programming Practice
2. Compute the average score of each student in the grade file
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class GradeAverage {
public GradeAverage() {
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "grade average");
job.setJarByClass(GradeAverage.class);
job.setMapperClass(GradeAverage.TokenizerMapper.class);
//Do not reuse the averaging reducer as a combiner: averaging per-split averages gives wrong results
job.setReducerClass(GradeAverage.IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static class TokenizerMapper extends Mapper<Object, Text, Text, DoubleWritable> {
private DoubleWritable score = new DoubleWritable();
private Text word = new Text();
public TokenizerMapper() {
}
public void map(Object key, Text value, Mapper<Object, Text, Text, DoubleWritable>.Context context) throws IOException, InterruptedException {
String[] tokens = value.toString().split("\t"); // tab-separated fields
if (tokens.length == 2) {
this.word.set(tokens[0]); // student ID
double scoreValue = Double.parseDouble(tokens[1]); // score
score.set(scoreValue);
context.write(this.word, score);
}
}
}
public static class IntSumReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
private DoubleWritable result = new DoubleWritable();
public IntSumReducer() {
}
public void reduce(Text key, Iterable<DoubleWritable> values, Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context) throws IOException, InterruptedException {
double sum = 0;
int count = 0;
for (DoubleWritable val : values) {
sum += val.get();
count++;
}
double average = sum / count;
result.set(average);
context.write(key, result);
}
}
}
3. The score file holds randomly generated scores for a number of students in Chinese, math, and English. Sort the records by total score in descending order; when totals are equal, sort by the Chinese score in ascending order.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;
public class ScoreSort {
public static class ScoreWritable implements WritableComparable<ScoreWritable> {
private int chineseScore;
private int mathScore;
private int englishScore;
public ScoreWritable() {
}
public void set(int chineseScore, int mathScore, int englishScore) {
this.chineseScore = chineseScore;
this.mathScore = mathScore;
this.englishScore = englishScore;
}
@Override
public int compareTo(ScoreWritable other) {
int totalScoreComparison = Integer.compare(other.getTotalScore(), this.getTotalScore());
if (totalScoreComparison != 0) {
return totalScoreComparison;
}
return Integer.compare(this.chineseScore, other.chineseScore);
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(chineseScore);
out.writeInt(mathScore);
out.writeInt(englishScore);
}
@Override
public void readFields(DataInput in) throws IOException {
chineseScore = in.readInt();
mathScore = in.readInt();
englishScore = in.readInt();
}
public int getTotalScore() {
return chineseScore + mathScore + englishScore;
}
public int getChineseScore() {
return chineseScore;
}
public int getMathScore() {
return mathScore;
}
public int getEnglishScore() {
return englishScore;
}
@Override
public String toString() {
return chineseScore + "\t" + mathScore + "\t" + englishScore;
}
}
public static class TokenizerMapper extends Mapper<Object, Text, ScoreWritable, Text> {
private ScoreWritable scoreWritable = new ScoreWritable();
private Text name = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
if (itr.hasMoreTokens()) {
// Assuming the first token is the student name
name.set(itr.nextToken());
if (itr.hasMoreTokens()) {
// Assuming the next three tokens are the scores for Chinese, Math, and English
int chineseScore = Integer.parseInt(itr.nextToken());
int mathScore = Integer.parseInt(itr.nextToken());
int englishScore = Integer.parseInt(itr.nextToken());
// Set the scores in the ScoreWritable object
scoreWritable.set(chineseScore, mathScore, englishScore);
// Emit the key-value pair with ScoreWritable as key and student name as value
context.write(scoreWritable, name);
}
}
}
}
public static class IntSumReducer extends Reducer<ScoreWritable, Text, Text, Text> {
private Text result = new Text();
public void reduce(ScoreWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text val : values) {
// Emit the student name, total score, and individual subject scores
result.set(key.toString() + "\t" + key.getTotalScore());
context.write(val, result);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: studentscoresort <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "student score sort");
job.setJarByClass(ScoreSort.class);
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(ScoreWritable.class);
job.setOutputValueClass(Text.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}