Join Example
Two tables:
Province (map output for row 1: <1, a#beijing>):
1 beijing
2 hangzhou
3 shaoxing
4 wenzhou
5 tianjing
6 taizhou
7 henan
8 wuhan
Data (map output for the first two rows: <1, b#2010 1962> <1, b#2010 2399>):
1 2010 1962
1 2010 2399
2 2011 1398
4 2011 4444
4 2010 3452
9 2010 2341 // invalid data: no province with id 9
9 2011 3442 // invalid data: no province with id 9
Join result for key 1: <1, beijing 2010 1962> <1, beijing 2010 2399>
Without the tags, two b records could be merged with each other; the tags guarantee that only an a record is joined with a b record.
Requirement: join the two tables and filter out the invalid data. Expected output:
1 beijing 2010 1962
1 beijing 2010 2399
Programming approach
1. Map phase: use the input split to determine which file each record comes from, and tag the value accordingly (a# for Province, b# for Data) so the two sides can be told apart when they are merged later.
   After the shuffle, the reducer receives for key 1: <1, [a#beijing, b#2010 1962, b#2010 2399]>
2. Reduce phase: join the a# values with the b# values.
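Before the full job, here is a minimal plain-Java sketch of the reduce-side idea (no Hadoop; the class name and hardcoded values are for illustration only and mirror the key-1 example above): the a#/b# tags let the reducer split the grouped values back into the two original tables and emit their cross product.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class JoinSketch {
    public static void main(String[] args) {
        // The grouped values the reducer would see for key 1 after the shuffle
        List<String> values = Arrays.asList("a#beijing", "b#2010 1962", "b#2010 2399");
        List<String> a = new ArrayList<String>(); // Province side
        List<String> b = new ArrayList<String>(); // Data side
        for (String v : values) {
            if (v.startsWith("a#")) {
                a.add(v.substring(2));
            } else if (v.startsWith("b#")) {
                b.add(v.substring(2));
            }
        }
        // Cross product of the two sides: the join result for this key.
        // For key 9 there would be no a# value, so the inner loop never
        // runs and the invalid rows are filtered out automatically.
        for (String left : a) {
            for (String right : b) {
                System.out.println("1 " + left + " " + right);
            }
        }
    }
}

Running it prints "1 beijing 2010 1962" and "1 beijing 2010 2399", matching the expected output above.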
package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Vector;
public class join {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Read the configuration (tells the job where HDFS is)
        Configuration conf = new Configuration();
        // 2. Create the job
        Job job = Job.getInstance(conf, "join");
        // Set the main class the job runs
        job.setJarByClass(join.class);
        // 3. Tell the job where to read data, how to process it, and where to write it
        // input
        Path inputPath = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inputPath); // how input files are read and split
        // map
        job.setMapperClass(join.mapper.class);         // which class handles the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // shuffle happens between map and reduce
        // reduce
        job.setReducerClass(join.reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // output
        Path outputPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputPath);
        // Submit the job and wait for it to finish
        boolean isSuccess = job.waitForCompletion(true);
        // Exit with 0 on success, 1 otherwise
        System.exit(isSuccess ? 0 : 1);
    }
    public static class mapper extends Mapper<LongWritable, Text, Text, Text> {
        private final static Text KEY = new Text();
        private final static Text VALUE = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Find out which file this split comes from, to tell the two tables apart
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            // Holders for the parsed key and tagged value
            String outkey = "";
            String outvalue = "";
            // Path of the file being read; used to decide whether to prefix the value with a# or b#
            String path = fileSplit.getPath().toString();
            // Convert the value to a String, e.g. "1 beijing"; the LongWritable key is the line's byte offset
            String line = value.toString();
            // Guard: skip empty lines
            if (line == null || line.equals("")) {
                return;
            }
            // Decide which file is currently being read
            if (path.contains("provinces")) {
                String[] values = line.split("\\s+"); // split the row on whitespace
                // A row that does not have exactly 2 columns is invalid
                if (values.length != 2) {
                    return;
                }
                outkey = values[0];
                outvalue = "a#" + values[1];
            } else if (path.contains("datas")) {
                String[] values = line.split("\\s+");
                // A row that does not have exactly 3 columns is invalid
                if (values.length != 3) {
                    return;
                }
                outkey = values[0];
                outvalue = "b#" + values[1] + " " + values[2];
            }
            // Emit only when both outkey and outvalue were set
            if (!outkey.equals("") && !outvalue.equals("")) {
                KEY.set(outkey);
                VALUE.set(outvalue);
                context.write(KEY, VALUE);
            }
        }
    }
    /** map output:  <1, a#beijing>
     *               <1, b#2010 1962>
     *               <1, b#2011 2019>
     * reduce input:  Text, Text
     * reduce output: <1, beijing 2010 1962>
     *                <1, beijing 2011 2019>
     *                Text, Text
     */
    public static class reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Two containers: A holds Province values (a#), B holds Data values (b#)
            Vector<String> A = new Vector<String>();
            Vector<String> B = new Vector<String>();
            for (Text val : values) {
                if (val.toString().startsWith("a#")) {
                    A.add(val.toString().substring(2));
                } else if (val.toString().startsWith("b#")) {
                    B.add(val.toString().substring(2));
                }
            }
            // Sizes of the containers, i.e. how many times to loop
            int sizeA = A.size();
            int sizeB = B.size();
            // Holder for each merged value
            String str = "";
            for (int i = 0; i < sizeA; i++) {
                for (int j = 0; j < sizeB; j++) {
                    str = A.get(i) + " " + B.get(j);
                    context.write(key, new Text(str));
                }
            }
        }
    }
}
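A sketch of how the job might be run (the jar name and paths here are assumptions; the mapper only requires that the input file paths contain the substrings "provinces" and "datas"):

hadoop jar join.jar mapreduce.join /user/hadoop/join/input /user/hadoop/join/output

The output part file should then contain the two expected rows for key 1 plus the matches for keys 2 and 4. Keys 3 and 5-8 have no Data rows and key 9 has no Province row, so for them one of the two containers is empty, the cross product emits nothing, and the invalid data is filtered out automatically. This is a classic reduce-side join; when one table is small, as Province is here, a map-side join that loads the small table into memory via the distributed cache would avoid shuffling it.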