Hadoop Learning (12): A MapReduce Case Study (Analyzing, Sorting, and Custom-Partitioning User Traffic Data)

This post presents a Hadoop case study that combines global sorting with a custom partitioning strategy: a custom Mapper, Reducer, Partitioner, and FlowBean class sort user traffic records by total usage and route them to partitions by phone-number prefix.


Continuing from the previous post, we now sort that job's output globally and add a custom partitioner.
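The input to this job is the output of the previous flow-summing job: one tab-separated line per phone number with the fields phone, upstream flow, downstream flow, and total, for example:

13726238888	2481	24681	27162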

1. Mapper class

package com.tiger.FlowSortMapper;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.tiger.FlowBean.FlowBean;
/**
 * @author tiger
 * @version 1.0
 */
public class FlowSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// Each input line: phone \t upFlow \t dwFlow \t sum (output of the previous job)
		String line = value.toString();
		String[] split = line.split("\t");
		long upFlow = Long.parseLong(split[1]);
		long dwFlow = Long.parseLong(split[2]);
		// Emit the bean as the key so the shuffle sorts records by total flow;
		// the phone number travels as the value.
		context.write(new FlowBean(upFlow, dwFlow), new Text(split[0]));
	}
}
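For the sample input line shown above, the mapper emits the pair below; the FlowBean constructor (next section) derives the total from the two flows:

key:   FlowBean(upFlow=2481, dwFlow=24681, sum=27162)
value: 13726238888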

2. FlowBean

package com.tiger.FlowBean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean>{
	private long upFlow;
	private long dwFlow;
	private long sum;
	
	public FlowBean() {}

	public FlowBean(long upFlow, long dwFlow) {
		this.upFlow = upFlow;
		this.dwFlow = dwFlow;
		sum=upFlow+dwFlow;
	}
	
	public long getUpFlow() {
		return upFlow;
	}

	public void setUpFlow(long upFlow) {
		this.upFlow = upFlow;
	}

	public long getDwFlow() {
		return dwFlow;
	}

	public void setDwFlow(long dwFlow) {
		this.dwFlow = dwFlow;
	}

	public long getSum() {
		return sum;
	}

	public void setSum(long sum) {
		this.sum = sum;
	}

	@Override // serialization: write the fields in a fixed order
	public void write(DataOutput out) throws IOException {
		out.writeLong(upFlow);
		out.writeLong(dwFlow);
		out.writeLong(sum);
	}

	@Override // deserialization: read the fields in the same order they were written
	public void readFields(DataInput in) throws IOException {
		upFlow = in.readLong();
		dwFlow = in.readLong();
		sum = in.readLong();
	}
	@Override
	public String toString() {
		return upFlow + "\t" + dwFlow + "\t" + sum;
	}

	@Override // sort ascending by total flow; never returns 0, so two beans with the
	// same total still compare as distinct keys and are not grouped in the reducer
	public int compareTo(FlowBean o) {
		return this.sum > o.getSum() ? 1 : -1;
	}

}
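As a quick local sanity check (a minimal sketch, not part of the job; the demo class name is hypothetical), compareTo orders beans in ascending order of total flow. The flow values are taken from the sample output in section 6:

import com.tiger.FlowBean.FlowBean;

public class FlowBeanCompareDemo {
	public static void main(String[] args) {
		FlowBean a = new FlowBean(120, 1320);   // sum = 1440
		FlowBean b = new FlowBean(735, 11349);  // sum = 12084
		System.out.println(a.compareTo(b));     // -1: a sorts before b
		System.out.println(b.compareTo(a));     // 1
	}
}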

3. Reducer class

package com.tiger.FlowSortReducer;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.tiger.FlowBean.FlowBean;

public class FlowSortReducer extends Reducer<FlowBean, Text, Text, FlowBean>{
	@Override
	protected void reduce(FlowBean key, Iterable<Text> value, Context context)
			throws IOException, InterruptedException {
		// Because compareTo never returns 0, each reduce call normally receives a
		// single phone number, but iterating is safer than taking only the first value.
		for (Text phone : value) {
			// Output line: phone \t upFlow \t dwFlow \t sum (via FlowBean.toString())
			context.write(phone, key);
		}
	}
}

4. Partitioner class

package com.tiger.FlowSortPartitioner;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import com.tiger.FlowBean.FlowBean;

public class FlowSortPartitioner extends Partitioner<FlowBean, Text> {

	@Override
	public int getPartition(FlowBean key, Text value, int numPartitions) {
		// Route each record by the first three digits of the phone number,
		// which is carried in the value.
		String prefix = value.toString().substring(0, 3);
		switch (prefix) {
		case "135": return 0;
		case "136": return 1;
		case "137": return 2;
		case "138": return 3;
		default:    return 4;   // all other prefixes go to the last partition
		}
	}
}
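A minimal local check of the routing logic (the demo class name is hypothetical; the phone numbers come from the sample data):

import org.apache.hadoop.io.Text;

import com.tiger.FlowBean.FlowBean;
import com.tiger.FlowSortPartitioner.FlowSortPartitioner;

public class PartitionerDemo {
	public static void main(String[] args) {
		FlowSortPartitioner p = new FlowSortPartitioner();
		FlowBean dummy = new FlowBean(0, 0);  // getPartition ignores the key
		System.out.println(p.getPartition(dummy, new Text("13560436666"), 5));  // 0
		System.out.println(p.getPartition(dummy, new Text("13822544101"), 5));  // 3
		System.out.println(p.getPartition(dummy, new Text("18320173382"), 5));  // 4
	}
}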

5. Driver class

package com.tiger.FlowSortDriver;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.tiger.FlowBean.FlowBean;
import com.tiger.FlowSortMapper.FlowSortMapper;
import com.tiger.FlowSortReducer.FlowSortReducer;
import com.tiger.FlowSortPartitioner.FlowSortPartitioner;

public class FlowSortDriver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration configuration=new Configuration();
		Job job=Job.getInstance(configuration);
		job.setJarByClass(FlowSortDriver.class);
		job.setMapperClass(FlowSortMapper.class);
		job.setReducerClass(FlowSortReducer.class);
		job.setMapOutputKeyClass(FlowBean.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		job.setPartitionerClass(FlowSortPartitioner.class);   // register the custom partitioner
		job.setNumReduceTasks(5);  // must cover all five partition indices (0-4)
		FileInputFormat.setInputPaths(job, new Path("f://fc//out"));
		FileOutputFormat.setOutputPath(job, new Path("f://fc/out1"));
		boolean waitForCompletion = job.waitForCompletion(true);
		System.out.println(waitForCompletion);	
	}		
}
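Assuming the classes are packaged into a jar (flowsort.jar is a hypothetical name), the job can be submitted with the standard hadoop jar command; the input and output paths are hardcoded in the driver, so no arguments are needed:

hadoop jar flowsort.jar com.tiger.FlowSortDriver.FlowSortDriver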

6. Sorted output

Each line shows the phone number, upstream flow, downstream flow, and total flow.

13480253104	120	1320	1440
13502468823	735	11349	12084
13510439658	1116	954	2070
13560436326	1136	94	1230
13560436666	1136	94	1230
13560439658	918	4938	5856
13602846565	198	910	1108
13660577991	660	690	1350
13719199419	240	0	240
13726130503	299	681	980
13726238888	2481	24681	27162
13760778710	120	120	240
13822544101	264	0	264
13884138413	4116	1432	5548
13922314466	3008	3720	6728
13925057413	11058	4243	15301
13926251106	240	0	240
13926435656	132	1512	1644
15013685858	369	338	707
15889002119	938	380	1318
15920133257	316	296	612
18212575961	1527	2106	3633
18320173382	9531	212	9743

7. Results with sorting and custom partitioning

Partition 1:

13560436666	1136	94	1230
13560436326	1136	94	1230
13510439658	1116	954	2070
13560439658	918	4938	5856
13502468823	735	11349	12084

Partition 2:

13602846565	198	910	1108
13660577991	660	690	1350

Partition 3:

13760778710	120	120	240
13719199419	240	0	240
13726130503	299	681	980
13726238888	2481	24681	27162

Partition 4:

13822544101	264	0	264
13884138413	4116	1432	5548

Partition 5:

13926251106	240	0	240
15920133257	316	296	612
15013685858	369	338	707
15889002119	938	380	1318
13480253104	120	1320	1440
13926435656	132	1512	1644
18212575961	1527	2106	3633
13922314466	3008	3720	6728
18320173382	9531	212	9743
13925057413	11058	4243	15301
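
Each partition is written by its own reduce task to a separate file under f://fc/out1, following the standard part-r-xxxxx naming convention:

part-r-00000   prefix 135
part-r-00001   prefix 136
part-r-00002   prefix 137
part-r-00003   prefix 138
part-r-00004   all other prefixes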

 
