Overview of the MapTask workflow
The RecordReader of TextInputFormat reads the input line by line, wraps each line into a key-value pair, and hands it to the Mapper. The Mapper runs its logic in map(key, value) and emits results with Context.write(key, value). Everything up to that write call is the map phase; everything after it belongs to the sort phase. The write call puts the key-value pair into a buffer; when the buffer reaches 80% capacity its contents are spilled out, but before each spill the records are sorted and written to a file by partition, and the resulting spill files are finally merged, partition by partition, into one file.
What information is collected into the buffer? For each record there is an index (records written earlier get smaller indexes, later ones larger), a partition number that marks which ReduceTask will eventually process the record, the start offsets of the key and of the value, and of course the serialized record data itself.
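For reference, a sketch of that per-record metadata as it appears in the Hadoop 2.x/3.x MapTask.MapOutputBuffer source: four ints are kept per record (the constant names below are quoted from that class and may vary slightly between versions):
// Per-record metadata layout in MapOutputBuffer's kvmeta array
private static final int VALSTART = 0;         // offset of the serialized value in the byte buffer
private static final int KEYSTART = 1;         // offset of the serialized key in the byte buffer
private static final int PARTITION = 2;        // partition number assigned by the Partitioner
private static final int VALLEN = 3;           // length of the serialized value
private static final int NMETA = 4;            // number of meta ints per record
private static final int METASIZE = NMETA * 4; // metadata size in bytes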
Shuffle details inside MapTask
Put plainly, the shuffle phase is everything that happens between the Mapper's output and the Reducer's input. Let's walk through what shuffle actually does:
1. Assigning the record output collector
if (job.getNumReduceTasks() == 0) {
output =
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else {
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
If the job has no Reduce phase, a direct record collector (NewDirectOutputCollector) is used: it does no sorting and writes records out in exactly the order the Mapper emits them.
If there is a Reduce phase, NewOutputCollector is used to collect the records.
2. Initializing the MapTask record output collector
NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
JobConf job,
TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException {
//The collector that does the real work: a buffer object that sorts the records it collects
collector = createSortingCollector(job, reporter);
// The number of ReduceTasks of the job is used as the total number of partitions; the default is 1
partitions = jobContext.getNumReduceTasks();
// Choose the Partitioner this MapTask will use
if (partitions > 1) {
partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
// With a single ReduceTask this anonymous partitioner is used; it puts every key-value pair into partition 0, which is why the output file is named part-r-00000
partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
@Override
public int getPartition(K key, V value, int numPartitions) {
return partitions - 1;
}
};
}
}
3. Obtaining the Partitioner
@SuppressWarnings("unchecked")
public Class<? extends Partitioner<?,?>> getPartitionerClass()
throws ClassNotFoundException {
return (Class<? extends Partitioner<?,?>>)
conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
}
This reads the mapreduce.job.partitioner.class parameter from the configuration; if it is not set, HashPartitioner is used as the partitioner.
public class HashPartitioner<K, V> extends Partitioner<K, V> {
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,int numReduceTasks) {
//The expression below computes the partition number
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}
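As a quick illustration of that formula, here is a small standalone sketch (not part of Hadoop; the key "hello" and the count of 3 ReduceTasks are made up) that computes a partition number the same way HashPartitioner does:
import org.apache.hadoop.io.Text;

public class HashPartitionDemo {
    public static void main(String[] args) {
        Text key = new Text("hello");   // a made-up Mapper output key
        int numReduceTasks = 3;         // a made-up number of ReduceTasks
        // & Integer.MAX_VALUE clears the sign bit so the modulo result is never negative
        int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
        System.out.println(key + " -> partition " + partition);
    }
}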
4. Initializing the buffer object
When there is a Reduce phase, MapTask uses MapOutputBuffer as the buffer implementation.
public void init(MapOutputCollector.Context context
) throws IOException, ClassNotFoundException {
// Spill threshold of the buffer, read from mapreduce.map.sort.spill.percent; defaults to 0.8 if not configured
final float spillper = job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
//Buffer size in MB, read from mapreduce.task.io.sort.mb; defaults to 100 if not configured
final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT,
INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
if (spillper > (float)1.0 || spillper <= (float)0.0) {
throw new IOException("Invalid \"" + JobContext.MAP_SORT_SPILL_PERCENT +
"\": " + spillper);
}
if ((sortmb & 0x7FF) != sortmb) {
throw new IOException(
"Invalid \"" + JobContext.IO_SORT_MB + "\": " + sortmb);
}
// Instantiate the sorter; QuickSort by default, and only the record metadata (indexes) is sorted, not the records themselves
sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class",QuickSort.class, IndexedSorter.class), job);
// k/v serialization
comparator = job.getOutputKeyComparator();
// Obtain serializers based on the Mapper output key-value types
// If the Mapper output key-value types implement the Writable interface, Hadoop provides serializers automatically
// If they do not implement Writable, you must supply your own serializer and register it with the Job
keyClass = (Class<K>)job.getMapOutputKeyClass();
valClass = (Class<V>)job.getMapOutputValueClass();
serializationFactory = new SerializationFactory(job);
keySerializer = serializationFactory.getSerializer(keyClass);
keySerializer.open(bb);
valSerializer = serializationFactory.getSerializer(valClass);
valSerializer.open(bb);
//MapTask output records can be written in a compressed format and decompressed again on the ReduceTask side
// Compression saves disk I/O and network I/O and improves MR performance
// compression
if (job.getCompressMapOutput()) {
Class<? extends CompressionCodec> codecClass =
job.getMapOutputCompressorClass(DefaultCodec.class);
codec = ReflectionUtils.newInstance(codecClass, job);
} else {
codec = null;
}
// combiner
final Counters.Counter combineInputCounter =
reporter.getCounter(TaskCounter.COMBINE_INPUT_RECORDS);
combinerRunner = CombinerRunner.create(job,getTaskID(), combineInputCounter,reporter, null);
if (combinerRunner != null) {
final Counters.Counter combineOutputCounter =
reporter.getCounter(TaskCounter.COMBINE_OUTPUT_RECORDS);
combineCollector= new CombineOutputCollector<K,V>(combineOutputCounter, reporter, job);
} else {
combineCollector = null;
}
spillInProgress = false;
minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
spillThread.setDaemon(true);
spillThread.setName("SpillThread");
spillLock.lock();
try {
spillThread.start();
while (!spillThreadRunning) {
spillDone.await();
}
} catch (InterruptedException e) {
throw new IOException("Spill thread failed to initialize", e);
} finally {
spillLock.unlock();
}
if (sortSpillException != null) {
throw new IOException("Spill thread failed to initialize",
sortSpillException);
}
}
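Both thresholds read in init() can be tuned from the driver. A minimal sketch, assuming the standard Hadoop 2.x+ property keys mapreduce.task.io.sort.mb and mapreduce.map.sort.spill.percent; the values below are examples, not recommendations:
Configuration conf = new Configuration();
// Buffer size in MB, read by init() via mapreduce.task.io.sort.mb (default 100)
conf.setInt("mapreduce.task.io.sort.mb", 200);
// Spill threshold, read by init() via mapreduce.map.sort.spill.percent (default 0.8)
conf.setFloat("mapreduce.map.sort.spill.percent", 0.9f);
Job job = Job.getInstance(conf);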
5. Obtaining the comparator for the Mapper output key
public RawComparator getOutputKeyComparator() {
// Read mapreduce.job.output.key.comparator.class from the configuration; it must be a RawComparator type; returns null if not configured
Class<? extends RawComparator> theClass = getClass(
JobContext.KEY_COMPARATOR, null, RawComparator.class);
// If the user configured this parameter, instantiate the user-defined comparator
if (theClass != null)
return ReflectionUtils.newInstance(theClass, this);
//If not configured, check whether the Mapper output key type is a subclass of WritableComparable; if it is not, throw an exception; if it is, the framework automatically supplies a comparator for the key
return WritableComparator.get(getMapOutputKeyClass().asSubclass(WritableComparable.class), this);
}
How to customize the comparator (two approaches):
① Write a class of type RawComparator and set mapreduce.job.output.key.comparator.class to that class.
The class may extend WritableComparator or implement RawComparator directly.
At sort time the framework invokes the byte-based RawComparator.compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2); a typical implementation deserializes the two keys there and then delegates to the object-based compare() (method three below follows exactly this pattern).
or
② Make the Mapper output key itself implement WritableComparable and provide a compareTo() method.
6. Combiner
A Combiner is, in essence, just a Reducer class.
A Combiner runs only if it has been explicitly set.
Differences between Combiner and Reducer:
The Reducer is invoked in the reduce phase.
The Combiner is invoked in the shuffle phase, and may run on the MapTask side or on the ReduceTask side.
Both are Reducer classes at heart, and both merge key-value pairs that share the same key.
Why it matters: merging same-key records early, during shuffle, reduces disk I/O and network I/O.
When it is safe to use: a Combiner only works for operations that can be applied to partial data without changing the final result, such as sums and counts; it does not work for operations like averages. In the average-salary example, the Reducer alone gives an average of 1400, but adding a Combiner changes the result to 1375, so such multiplication/division-style aggregations produce wrong answers.
We set the Combiner in the driver, passing in the Reducer class I wrote (see the driver excerpt after this list); with it enabled, the job runs noticeably faster.
A Combiner must never change the processing logic or the final result.
Used correctly, a Combiner optimizes the MR program and improves its efficiency.
The Combiner may run on the MapTask side:
① before every spill, the Combiner locally merges the data about to be spilled;
② during merge, if there are at least 3 spill files and a Combiner is set, the Combiner combines the data again.
The Combiner may also run on the ReduceTask side:
③ the shuffle threads copy the same partition from multiple MapTasks; the copied data is then merged and sorted, and if it is too large, part of it is merged, sorted, and spilled to disk first;
if a Combiner is set, it runs again at that point.
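The driver excerpt referred to above; WordCountReducer is a hypothetical sum-only Reducer, used here just to show that such a Reducer can double as the Combiner:
// The Reducer is set as usual
job.setReducerClass(WordCountReducer.class);
// Reuse the same class as the Combiner, so partial sums are merged during shuffle
job.setCombinerClass(WordCountReducer.class);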
7. Summary
① Partitioning
a) The total number of partitions is determined by the number of ReduceTasks.
How many ReduceTasks a Job starts depends on how many partitions you want; each partition eventually produces one result file.
b) Choosing the Partitioner
If the number of ReduceTasks is greater than 1, the user-configured Partitioner is used, falling back to HashPartitioner if none is set.
If the number of ReduceTasks is 1, the framework supplies a default Partitioner that sends every record to partition 0.
② Sorting
Before every spill, quicksort is used.
During the final merge, merge sort is used.
③ Comparator
Sorting is driven by the comparator's results.
a) If the user defines a comparator (of type RawComparator), MR uses it.
b) Otherwise the Mapper output key must implement the WritableComparable interface,
and the framework supplies a comparator automatically.
Bottom line: whether you write the comparator yourself or implement WritableComparable, the comparison normally ends up calling the compareTo() you wrote, unless your comparator overrides the byte-level compare() directly.
④ Combiner
The Combiner runs during the shuffle phase.
a) Before every spill, the Combiner locally merges the data being spilled.
b) During merge, if there are at least 3 spill files and a Combiner is set, the Combiner combines the data again.
⑤ Execution flow
a) The Partitioner computes a partition for each record.
b) When the spill condition is met, the buffered data is sorted, using the comparator to compare keys.
Each pre-spill sort uses quicksort by default.
If a Combiner is set, the sorted data is combined before it is spilled.
c) Step b) happens N times.
d) All spill files are merged into one final file, using merge sort on the keys.
If there are at least 3 spill files, the Combiner is invoked again while the final file is written,
and the combined data is then written out.
Custom partitioning example
Suppose we have a folder of files where each line records a phone number, its upstream traffic, downstream traffic, and so on. We want to split the output by phone-number prefix: numbers starting with 136, 137, 138, and 139 each go into their own file, and all other numbers go into a fifth file. Five output files means five partitions, so we need a custom partitioner. Add the following to the driver (MyPartitioner is the custom partitioner):
// Set the number of ReduceTasks to 5
job.setNumReduceTasks(5);
// Use the custom partitioner
job.setPartitionerClass(MyPartitioner.class);
Now let's look at how the custom partitioner is written.
[Note] FlowBean is the object I use to wrap each line of the input files; see the traffic-statistics post (统计手机流量) for details.
public class MyPartitioner extends Partitioner<Text, FlowBean>{
// Compute the partition; numPartitions is the total number of partitions, i.e. the number of ReduceTasks
// The partition number must be an int satisfying 0 <= partitionNum < numPartitions
@Override
public int getPartition(Text key, FlowBean value, int numPartitions) {
// The first three digits of the phone number decide the partition
String prefix = key.toString().substring(0, 3);
int partitionNum=0;
switch (prefix) {
case "136":
partitionNum=numPartitions-1;
break;
case "137":
partitionNum=numPartitions-2;
break;
case "138":
partitionNum=numPartitions-3;
break;
case "139":
partitionNum=numPartitions-4;
break;
default:
break;
}
return partitionNum;
}
}
Sorting examples
Method one (extend WritableComparator)
Requirement: sort by total traffic in descending order. We must define our own comparator, because the total-traffic key is a LongWritable, whose default comparator sorts in ascending order. [Note] Read the traffic-statistics post (流量统计) first.
Our input is the final output produced in that reference post:
Mapper
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/*
 * Input data format (tab-separated): 13470253144	180	180	360
 * i.e. phone number, upFlow, downFlow, total flow (sumFlow)
 */
public class FlowBeanMapper extends Mapper<LongWritable, Text, LongWritable, Text>{
private LongWritable out_key=new LongWritable();
private Text out_value=new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] words = value.toString().split("\t");
//Use the total traffic (sumFlow) as the output key
out_key.set(Long.parseLong(words[3]));
out_value.set(words[0]+"\t"+words[1]+"\t"+words[2]);
context.write(out_key, out_value);
}
}
Reducer
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowBeanReducer extends Reducer<LongWritable, Text, Text, LongWritable>{
@Override
protected void reduce(LongWritable key, Iterable<Text> values,
Reducer<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(value, key);
}
}
}
Driver
public class FlowBeanDriver {
public static void main(String[] args) throws Exception {
Path inputPath=new Path("E:\\mroutput\\flowbean");
Path outputPath=new Path("e:/mroutput/flowbeanSort1");
//Configuration for the whole Job
Configuration conf = new Configuration();
//Make sure the output directory does not exist
FileSystem fs=FileSystem.get(conf);
if (fs.exists(outputPath)) {
fs.delete(outputPath, true);
}
// ① Create the Job
Job job = Job.getInstance(conf);
// ② Configure the Job
// Set the Mapper and Reducer classes and the key-value types they output
job.setMapperClass(FlowBeanMapper.class);
job.setReducerClass(FlowBeanReducer.class);
// The Job prepares serializers based on the Mapper/Reducer output key-value types and uses them to serialize and deserialize the output key-value pairs
// If the Mapper and Reducer output key-value types were identical, we could set only the Job's final output types
//Here the Mapper and Reducer outputs differ, so each type is set explicitly
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// Set the input and output directories
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
// Use the custom descending comparator
job.setSortComparatorClass(MyDescComparator.class);
// ③ Run the Job
job.waitForCompletion(true);
}
}
MyDescComparator
import org.apache.hadoop.io.WritableComparator;
public class MyDescComparator extends WritableComparator{
// Compare the serialized keys directly: each key is a LongWritable (8 bytes),
// so readLong() can decode it straight from the byte arrays without deserializing objects
@Override
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
long thisValue = readLong(b1, s1);
long thatValue = readLong(b2, s2);
// Flip the usual ordering to get a descending sort
return (thisValue<thatValue ? 1 : (thisValue==thatValue ? 0 : -1));
}
}
Method two (implement the WritableComparable interface)
FlowBean
public class FlowBean implements WritableComparable<FlowBean>{
private long upFlow;
private long downFlow;
private Long sumFlow;
public FlowBean() {
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
// Serialization: when writing out fields, reference-type fields must not be null
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(downFlow);
out.writeLong(sumFlow);
}
//Deserialization: read the fields in exactly the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
upFlow=in.readLong();
downFlow=in.readLong();
sumFlow=in.readLong();
}
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow;
}
// The framework-supplied comparator calls the key's compareTo() when comparing keys
// Compare total traffic in descending order
@Override
public int compareTo(FlowBean o) {
return -this.sumFlow.compareTo(o.getSumFlow());
}
}
Mapper
public class FlowBeanMapper extends Mapper<LongWritable, Text, FlowBean, Text>{
private FlowBean out_key=new FlowBean();
private Text out_value=new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] words = value.toString().split("\t");
//Wrap the up/down/total traffic into the FlowBean key
out_key.setUpFlow(Long.parseLong(words[1]));
out_key.setDownFlow(Long.parseLong(words[2]));
out_key.setSumFlow(Long.parseLong(words[3]));
out_value.set(words[0]);
context.write(out_key, out_value);
}
}
Reducer
public class FlowBeanReducer extends Reducer<FlowBean, Text, Text, FlowBean>{
@Override
protected void reduce(FlowBean key, Iterable<Text> values,
Reducer<FlowBean, Text, Text, FlowBean>.Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(value, key);
}
}
}
Driver
public class FlowBeanDriver {
public static void main(String[] args) throws Exception {
Path inputPath=new Path("E:\\mroutput\\flowbean");
Path outputPath=new Path("e:/mroutput/flowbeanSort2");
//Configuration for the whole Job
Configuration conf = new Configuration();
//Make sure the output directory does not exist
FileSystem fs=FileSystem.get(conf);
if (fs.exists(outputPath)) {
fs.delete(outputPath, true);
}
// ① Create the Job
Job job = Job.getInstance(conf);
// ② Configure the Job
// Set the Mapper and Reducer classes and the key-value types they output
job.setMapperClass(FlowBeanMapper.class);
job.setReducerClass(FlowBeanReducer.class);
// The Job prepares serializers based on the Mapper/Reducer output key-value types
// If the Mapper and Reducer output types were identical we could set only the final output types; here they differ, so each is set explicitly
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
// Set the input and output directories
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
// ③ Run the Job
job.waitForCompletion(true);
}
}
Incidentally, for method one I did not actually need to write my own descending comparator: the map output key there is a LongWritable, and LongWritable already ships with a descending comparator, so I only have to set it in the driver (see the excerpt below).
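A minimal driver excerpt for that alternative, assuming the method-one job whose map output key is LongWritable; LongWritable.DecreasingComparator is the descending comparator bundled with Hadoop:
// Use Hadoop's built-in descending comparator for LongWritable keys
job.setSortComparatorClass(LongWritable.DecreasingComparator.class);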
Method three (implement the RawComparator interface)
FlowBean
public class FlowBean implements Writable{
private long upFlow;
private long downFlow;
private Long sumFlow;
public FlowBean() {
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public Long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
// Serialization: when writing out fields, reference-type fields must not be null
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(downFlow);
out.writeLong(sumFlow);
}
//Deserialization: read the fields in exactly the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
upFlow=in.readLong();
downFlow=in.readLong();
sumFlow=in.readLong();
}
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow;
}
/*// In method three the key no longer implements WritableComparable, so the compareTo()
// from method two (descending comparison on total traffic) is commented out:
@Override
public int compareTo(FlowBean o) {
return -this.sumFlow.compareTo(o.getSumFlow());
}
*/
}
Mapper
public class FlowBeanMapper extends Mapper<LongWritable, Text, FlowBean, Text>{
private FlowBean out_key=new FlowBean();
private Text out_value=new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] words = value.toString().split("\t");
//Wrap the up/down/total traffic into the FlowBean key
out_key.setUpFlow(Long.parseLong(words[1]));
out_key.setDownFlow(Long.parseLong(words[2]));
out_key.setSumFlow(Long.parseLong(words[3]));
out_value.set(words[0]);
context.write(out_key, out_value);
}
}
Reducer
public class FlowBeanReducer extends Reducer<FlowBean, Text, Text, FlowBean>{
@Override
protected void reduce(FlowBean key, Iterable<Text> values,
Reducer<FlowBean, Text, Text, FlowBean>.Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(value, key);
}
}
}
Driver
public class FlowBeanDriver {
public static void main(String[] args) throws Exception {
Path inputPath=new Path("E:\\mroutput\\flowbean");
Path outputPath=new Path("e:/mroutput/flowbeanSort3");
//Configuration for the whole Job
Configuration conf = new Configuration();
//Make sure the output directory does not exist
FileSystem fs=FileSystem.get(conf);
if (fs.exists(outputPath)) {
fs.delete(outputPath, true);
}
// ① Create the Job
Job job = Job.getInstance(conf);
// ② Configure the Job
// Set the Mapper and Reducer classes and the key-value types they output
job.setMapperClass(FlowBeanMapper.class);
job.setReducerClass(FlowBeanReducer.class);
// The Job prepares serializers based on the Mapper/Reducer output key-value types
// If the Mapper and Reducer output types were identical we could set only the final output types; here they differ, so each is set explicitly
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
// Set the input and output directories
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
job.setSortComparatorClass(MyRawComparator.class);
// ③ Run the Job
job.waitForCompletion(true);
}
}
MyRawComparator
import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.RawComparator;
public class MyRawComparator implements RawComparator<FlowBean>{
private FlowBean key1=new FlowBean();
private FlowBean key2=new FlowBean();
private DataInputBuffer buffer=new DataInputBuffer();
// Deserialize the two keys to compare from the byte buffers, then delegate to compare(FlowBean, FlowBean)
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
try {
buffer.reset(b1, s1, l1); // parse key1
key1.readFields(buffer);
buffer.reset(b2, s2, l2); // parse key2
key2.readFields(buffer);
buffer.reset(null, 0, 0); // clean up reference
} catch (IOException e) {
throw new RuntimeException(e);
}
return compare(key1, key2);
}
// The object-level compare() that does the final (descending) comparison on total traffic
@Override
public int compare(FlowBean o1, FlowBean o2) {
return -o1.getSumFlow().compareTo(o2.getSumFlow());
}
}