自定义RecordReader

本文介绍了如何根据需求自定义Hadoop的RecordReader,以实现对输入数据的奇偶行进行统计。通过创建自定义的InputFormat、RecordReader、Partitioner和Mapper、Reducer,实现了对文件中奇数行和偶数行的独立处理。实验结果显示,计算结果正确,展示了自定义RecordReader在满足特定数据处理需求时的重要性。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

自定义RecordReader

Hadoop默认的InputFormat为TextInputFormat,对应的数据解析器(RecordReader)默认为LineRecordReader。
我们可以根据需要自定义InputFormat和RecordReader,来个性化对输入的处理。

下面这个例子是我学习过程中参考视频教程做的一个练习,查了很多资料,大概弄懂,满心欢喜,兴致勃勃,欲记之,研墨毕,惊觉早有此例之详述,吾至而立之年以来,渐得一习:凡所经苦思冥想之事,必记之,以为缅。遂得此文,不究雷同与否。

实验环境
操作系统: Ubuntu 16.04 LTS
Hadoop版本: Apache Hadoop2.6.5
JDK版本: JDK1.7
集群配置: 伪分布式模式


问题描述

需求:对如下文件,分别统计奇数行和偶数行总和

number


问题分析

问题的难点在于:
我们如何区分读入的数据是奇数行还是偶数行

Hadoop默认的InputFormat处理类为TextInputFormat,它将数据分片对应的数据读入,划分为 <offset,text> 这样的形式。针对此例中的要求,我们可以通过指定自定义的InputFormat子类,来实现对原始数据的自定义处理规则。

这里采用的方法是,通过自定义的InputFormat,读取记录时记录当前行号line_number,将number.txt中的数据转化成<line_number,text> 这样的形式,根据line_number 确定奇偶行,对map的输出进行partitioner操作,对应到处理奇数行之和与偶数行之和的reducer中。

注:在这个例子中,不对数据文件进行分片

编码

MyInputFormat.java
自定义的InputFormat,用自定义的RecordReader对象读入分片对应的数据 , 不允许文件分片

package mr;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

//这里从文件读取分片,继承FileInputFormat类
public class MyInputFormat extends FileInputFormat<LongWritable, Text> {
   
   

    // 自定义的RecordReader,负责解析分片对应的数据
    private MyRecordReader myRecordReader=null;

    //RecordReader负责处理分片对应的数据
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit inputSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {

        myRecordReader=nwe MyRecordReader(inputSplit,context);

        //初始化自定义的RecordReader对象
        myRecordReader.initialize();

        return myRecordReader;
    }

    //是否可分割文件,在这个例子中,不需要对输入进行分片,直接返回false
    @Override
    protected boolean isSplitable(J
package BeiKe; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.json.JSONArray; import org.json.JSONObject; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; public class CleanHouse { // 自定义InputFormat处理完整JSON数组 public static class WholeFileInputFormat extends FileInputFormat<Text, Text> { @Override public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) { return new WholeFileRecordReader(); } @Override protected boolean isSplitable(JobContext context, Path file) { return false; // 禁止文件分割 } } // 自定义RecordReader读取整个文件 public static class WholeFileRecordReader extends RecordReader<Text, Text> { private Text key = new Text(); private Text value = new Text(); private boolean processed = false; @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException { FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); FileSystem fs = path.getFileSystem(context.getConfiguration()); try (FSDataInputStream in = fs.open(path); BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { StringBuilder sb = new StringBuilder(); String line; while ((line = reader.readLine()) != null) { sb.append(line); } value.set(sb.toString()); } } @Override public boolean nextKeyValue() { if (!processed) { key.set("json_array"); processed = true; return true; } return false; } @Override public Text getCurrentKey() { return key; } @Override public Text getCurrentValue() { return value; } @Override public float getProgress() { 
return processed ? 1.0f : 0.0f; } @Override public void close() {} } // Mapper类处理JSON数组 public static class JsonMapper extends Mapper<Text, Text, Text, Text> { @Override protected void map(Text key, Text value, Context context) throws IOException, InterruptedException { try { String jsonString = value.toString().trim(); JSONArray jsonArray = new JSONArray(jsonString); for (int i = 0; i < jsonArray.length(); i++) { JSONObject json = jsonArray.getJSONObject(i); // 处理单价字段 - 增强过滤条件 String price = json.optString("单价(元/平方米)", ""); if (price == null || price.isEmpty() || "暂无信息".equals(price) || "无".equals(price) || "NaN".equals(price) || "NaN元/平方米".equals(price)) { continue; } // 移除"元/平方米"后缀并转换为数字 double priceValue = parsePrice(price); if (Double.isNaN(priceValue)) { // 确保所有NaN被过滤 continue; } // 更新单价字段值为纯数字 json.put("单价(元/平方米)", priceValue); // 输出有效记录 - 使用固定键确保所有数据进入同一个Reducer context.write(new Text("all_records"), new Text(json.toString())); } } catch (Exception e) { System.err.println("JSON处理错误: " + e.getMessage()); System.err.println("错误数据: " + value.toString()); } } // 增强价格解析方法 private double parsePrice(String priceStr) { try { // 移除"元/平方米"后缀 String cleaned = priceStr.replace("元/平方米", "").trim(); // 处理特殊NaN情况 if ("NaN".equals(cleaned) || "暂无信息".equals(cleaned) || "无".equals(cleaned)) { return Double.NaN; } // 尝试解析为数字 return Double.parseDouble(cleaned); } catch (NumberFormatException e) { System.err.println("价格解析错误: " + priceStr); return Double.NaN; } } } // Reducer类构建完整JSON数组 public static class JsonReducer extends Reducer<Text, Text, Text, Text> { // 定义固定的字段顺序 private static final String[] FIELD_ORDER = { "地点", "单价(元/平方米)", "绿化率", "开发商", "建成年代" }; @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { // 收集所有JSON对象 List<JSONObject> jsonObjects = new ArrayList<>(); for (Text value : values) { jsonObjects.add(new JSONObject(value.toString())); } // 构建完整JSON数组 StringBuilder result = new 
StringBuilder("[\n"); for (int i = 0; i < jsonObjects.size(); i++) { JSONObject obj = jsonObjects.get(i); result.append(" {\n"); // 按照固定顺序输出字段 for (int j = 0; j < FIELD_ORDER.length; j++) { String field = FIELD_ORDER[j]; if (!obj.has(field)) continue; result.append(" \"").append(field).append("\": "); Object val = obj.get(field); if (val instanceof String) { result.append("\"").append(val).append("\""); } else { result.append(val); } // 添加逗号分隔符(最后一个字段不加) if (j < FIELD_ORDER.length - 1) { result.append(","); } result.append("\n"); } result.append(" }"); if (i < jsonObjects.size() - 1) { result.append(","); } result.append("\n"); } result.append("]"); // 输出完整JSON数组 context.write(null, new Text(result.toString())); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "JSON Array Processor"); job.setJarByClass(CleanHouse.class); job.setInputFormatClass(WholeFileInputFormat.class); job.setMapperClass(JsonMapper.class); job.setReducerClass(JsonReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 
0 : 1); } }和package BeiKe; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.json.JSONArray; import org.json.JSONObject; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.HashSet; public class CleanHouse1 { // 自定义InputFormat处理完整JSON数组 public static class WholeFileInputFormat extends FileInputFormat<Text, Text> { @Override public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) { return new WholeFileRecordReader(); } @Override protected boolean isSplitable(JobContext context, Path file) { return false; // 禁止文件分割 } } // 自定义RecordReader读取整个文件 public static class WholeFileRecordReader extends RecordReader<Text, Text> { private Text key = new Text(); private Text value = new Text(); private boolean processed = false; @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException { FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); FileSystem fs = path.getFileSystem(context.getConfiguration()); try (FSDataInputStream in = fs.open(path); BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { StringBuilder sb = new StringBuilder(); String line; while ((line = reader.readLine()) != null) { sb.append(line); } value.set(sb.toString()); } } @Override public boolean nextKeyValue() { if (!processed) { key.set("json_array"); processed = true; return true; } return false; } @Override public Text getCurrentKey() { return key; } @Override public Text 
getCurrentValue() { return value; } @Override public float getProgress() { return processed ? 1.0f : 0.0f; } @Override public void close() {} } // Mapper类处理JSON数组 public static class JsonMapper extends Mapper<Text, Text, Text, Text> { // 需要检查的关键字段 private static final Set<String> KEY_FIELDS = new HashSet<String>() {{ add("地点"); add("元/平方米"); add("绿化率"); add("开发商"); add("建成年代"); }}; // 无效值集合 private static final Set<String> INVALID_VALUES = new HashSet<String>() {{ add("无"); add("暂无信息"); add("NaN"); }}; // 检查关键字段是否包含无效值 private boolean containsInvalidValue(JSONObject json) { for (String field : KEY_FIELDS) { if (json.has(field)) { Object value = json.get(field); if (value instanceof String) { String strValue = ((String) value).trim(); if (INVALID_VALUES.contains(strValue)) { return true; } } } } return false; } // 处理单价字段并重命名 private void processPriceField(JSONObject json) { String price = json.optString("单价(元/平方米)", ""); if (INVALID_VALUES.contains(price.trim())) { return; // 无效值,跳过记录 } // 重命名字段 json.put("元/平方米", price); json.remove("单价(元/平方米)"); } @Override protected void map(Text key, Text value, Context context) throws IOException, InterruptedException { try { String jsonString = value.toString().trim(); JSONArray jsonArray = new JSONArray(jsonString); for (int i = 0; i < jsonArray.length(); i++) { JSONObject json = jsonArray.getJSONObject(i); // 处理单价字段 processPriceField(json); // 检查关键字段是否包含无效值 if (containsInvalidValue(json)) { continue; // 跳过包含无效值的记录 } // 输出有效记录 context.write(new Text("all_records"), new Text(json.toString())); } } catch (Exception e) { System.err.println("JSON处理错误: " + e.getMessage()); System.err.println("错误数据: " + value.toString()); } } } // Reducer类构建完整JSON数组 public static class JsonReducer extends Reducer<Text, Text, Text, Text> { @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { // 收集所有JSON对象 List<JSONObject> jsonObjects = new ArrayList<>(); for (Text value : values) 
{ jsonObjects.add(new JSONObject(value.toString())); } // 构建完整JSON数组 StringBuilder result = new StringBuilder("[\n"); for (int i = 0; i < jsonObjects.size(); i++) { // 格式化JSON对象输出 JSONObject obj = jsonObjects.get(i); result.append(" {\n"); // 添加每个字段并格式化 String[] keys = obj.keySet().toArray(new String[0]); for (int j = 0; j < keys.length; j++) { String field = keys[j]; result.append(" \"").append(field).append("\": "); Object val = obj.get(field); if (val instanceof String) { result.append("\"").append(val).append("\""); } else { result.append(val); } if (j < keys.length - 1) { result.append(","); } result.append("\n"); } result.append(" }"); if (i < jsonObjects.size() - 1) { result.append(","); } result.append("\n"); } result.append("]"); // 输出完整JSON数组 context.write(null, new Text(result.toString())); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "JSON Array Processor"); job.setJarByClass(CleanHouse1.class); job.setInputFormatClass(WholeFileInputFormat.class); job.setMapperClass(JsonMapper.class); job.setReducerClass(JsonReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } }俩个代码进行合并,放在一个class里,第一个代码的方法名为ProcessPrice(),第二个代码方法名为ProcessNull()
最新发布
06-14
package BeiKe; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.json.JSONArray; import org.json.JSONObject; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; public class CleanHouse { // 自定义InputFormat处理完整JSON数组 public static class WholeFileInputFormat extends FileInputFormat<Text, Text> { @Override public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) { return new WholeFileRecordReader(); } @Override protected boolean isSplitable(JobContext context, Path file) { return false; // 禁止文件分割,确保整个JSON数组被完整读取 } } // 自定义RecordReader读取整个文件 public static class WholeFileRecordReader extends RecordReader<Text, Text> { private Text key = new Text(); private Text value = new Text(); private boolean processed = false; @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException { FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); FileSystem fs = path.getFileSystem(context.getConfiguration()); try (FSDataInputStream in = fs.open(path); BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { StringBuilder sb = new StringBuilder(); String line; while ((line = reader.readLine()) != null) { sb.append(line); } value.set(sb.toString()); } } @Override public boolean nextKeyValue() { if (!processed) { key.set("json_array"); processed = true; return true; } return false; } @Override public Text getCurrentKey() { return key; } @Override public Text getCurrentValue() { return value; } @Override public float getProgress() { return processed ? 
1.0f : 0.0f; } @Override public void close() {} } // Mapper类处理JSON数组 public static class JsonMapper extends Mapper<Text, Text, Text, Text> { @Override protected void map(Text key, Text value, Context context) throws IOException, InterruptedException { try { String jsonString = value.toString().trim(); // 移除JSON数组前后的方括号(如果存在) if (jsonString.startsWith("[") && jsonString.endsWith("]")) { jsonString = jsonString.substring(1, jsonString.length() - 1); } // 分割JSON对象(假设对象之间用逗号分隔) String[] jsonObjects = jsonString.split(",(?=\\{)"); for (String obj : jsonObjects) { if (!obj.trim().isEmpty()) { JSONObject json = new JSONObject(obj.trim()); // 处理单价字段 String price = json.optString("单价(元/平方米)", ""); if (price.isEmpty() || "暂无信息".equals(price) || "无".equals(price)) { continue; } // 重命名字段 json.put("元/平方米", json.remove("单价(元/平方米)")); // 输出有效记录 context.write(new Text(json.getString("地点")), new Text(json.toString())); } } } catch (Exception e) { System.err.println("JSON处理错误: " + e.getMessage()); System.err.println("错误数据: " + value.toString()); } } } // Reducer类直接输出结果 public static class JsonReducer extends Reducer<Text, Text, Text, Text> { @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { for (Text value : values) { context.write(null, value); } } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "JSON Array Processor"); job.setJarByClass(CleanHouse.class); job.setInputFormatClass(WholeFileInputFormat.class); job.setMapperClass(JsonMapper.class); job.setReducerClass(JsonReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 
0 : 1); } }上述代码运行结果如下[root@hadoop01 software]# hadoop jar CleanHouse.jar /贝壳网南通地区房价数据.json /out 25/06/13 13:42:10 INFO client.RMProxy: Connecting to ResourceManager at hadoop01/192.168.164.128:8032 25/06/13 13:42:10 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this. 25/06/13 13:42:13 INFO input.FileInputFormat: Total input paths to process : 1 25/06/13 13:42:13 INFO mapreduce.JobSubmitter: number of splits:1 25/06/13 13:42:13 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1749776424019_0030 25/06/13 13:42:14 INFO impl.YarnClientImpl: Submitted application application_1749776424019_0030 25/06/13 13:42:14 INFO mapreduce.Job: The url to track the job: http://hadoop01:8088/proxy/application_1749776424019_0030/ 25/06/13 13:42:14 INFO mapreduce.Job: Running job: job_1749776424019_0030 25/06/13 13:42:27 INFO mapreduce.Job: Job job_1749776424019_0030 running in uber mode : false 25/06/13 13:42:27 INFO mapreduce.Job: map 0% reduce 0% 25/06/13 13:42:56 INFO mapreduce.Job: map 100% reduce 0% 25/06/13 13:43:13 INFO mapreduce.Job: map 100% reduce 100% 25/06/13 13:43:14 INFO mapreduce.Job: Job job_1749776424019_0030 completed successfully 25/06/13 13:43:14 INFO mapreduce.Job: Counters: 49 File System Counters FILE: Number of bytes read=203 FILE: Number of bytes written=283483 FILE: Number of read operations=0 FILE: Number of large read operations=0 FILE: Number of write operations=0 HDFS: Number of bytes read=475889 HDFS: Number of bytes written=153 HDFS: Number of read operations=6 HDFS: Number of large read operations=0 HDFS: Number of write operations=2 Job Counters Launched map tasks=1 Launched reduce tasks=1 Data-local map tasks=1 Total time spent by all maps in occupied slots (ms)=25723 Total time spent by all reduces in occupied slots (ms)=12371 Total time spent by all map tasks (ms)=25723 Total time spent by all reduce tasks (ms)=12371 
Total vcore-milliseconds taken by all map tasks=25723 Total vcore-milliseconds taken by all reduce tasks=12371 Total megabyte-milliseconds taken by all map tasks=26340352 Total megabyte-milliseconds taken by all reduce tasks=12667904 Map-Reduce Framework Map input records=1 Map output records=1 Map output bytes=194 Map output materialized bytes=203 Input split bytes=124 Combine input records=0 Combine output records=0 Reduce input groups=1 Reduce shuffle bytes=203 Reduce input records=1 Reduce output records=1 Spilled Records=2 Shuffled Maps =1 Failed Shuffles=0 Merged Map outputs=1 GC time elapsed (ms)=497 CPU time spent (ms)=3640 Physical memory (bytes) snapshot=433668096 Virtual memory (bytes) snapshot=4200083456 Total committed heap usage (bytes)=274726912 Shuffle Errors BAD_ID=0 CONNECTION=0 IO_ERROR=0 WRONG_LENGTH=0 WRONG_MAP=0 WRONG_REDUCE=0 File Input Format Counters Bytes Read=475765 File Output Format Counters Bytes Written=153 [root@hadoop01 software]# hadoop fs -cat /out/part* {“地点”:“(苏锡通园区) 江海路蓝天花苑”,“元/平方米”:“4867元/平方米”,“绿化率”:0.2,“开发商”:“无”,“建成年代”:“1998-2010年”} ,但json中有有多条数据,且要求输出格式下面类似{ "地点": "(海门区) 通源路147号", "单价(元/平方米)": "9281元/平方米", "绿化率": 0.35, "开发商": "暂无信息", "建成年代": "2001-2015年" }, { "地点": "(海门区) 丝绸东路299号", "单价(元/平方米)": "7716元/平方米", "绿化率": 0.3, "开发商": "沪商置业有限公司", "建成年代": "2013-2020年" }, { "地点": "(海门区) 粉坊弄", "单价(元/平方米)": "8178元/平方米", "绿化率": 0.01, "开发商": "上海中海海昆房地产有限公司", "建成年代": "1993-2008年" }
06-14
### 安装和配置 RecordReader RecordReader 并不是一个独立的组件,因此不存在传统意义上的“安装”。相反,在 Hadoop 的 MapReduce 框架中,`RecordReader` 是通过编程的方式来自定义并集成到作业中的。具体来说,为了使 `RecordReader` 能够按照特定的需求工作,通常需要完成以下几个方面的开发配置: #### 自定义 RecordReader 类 创建一个新的 Java 类继承自 `org.apache.hadoop.mapreduce.RecordReader<K,V>` 抽象类,并实现其所有抽象方法。这一步骤允许开发者指定如何解析输入数据源以及怎样提取键值对[^2]。 ```java public class CustomRecordReader extends RecordReader<LongWritable, Text> { @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // 初始化逻辑... } @Override public boolean nextKeyValue() throws IOException, InterruptedException { // 实现迭代获取下一个 key-value 对的方法... return false; } @Override public LongWritable getCurrentKey() throws IOException, InterruptedException { // 返回当前处理的 key ... return null; } @Override public Text getCurrentValue() throws IOException, InterruptedException { // 返回当前处理的 value ... return null; } @Override public float getProgress() throws IOException, InterruptedException { // 记录进度百分比... return 0f; } @Override public void close() throws IOException { // 清理资源... } } ``` #### 创建自定义 InputFormat 类 接着要构建一个实现了 `InputFormat<K,V>` 接口的新类,其中最重要的是覆盖 `createRecordReader()` 方法以返回之前定义好的 `CustomRecordReader` 实例对象。这样做可以确保当执行 MapReduce 任务时会使用该定制化的记录读取器去加载数据[^3]。 ```java public class CustomInputFormat extends FileInputFormat<LongWritable, Text> { @Override protected boolean isSplitable(JobContext context, Path filename) { // 控制是否支持切分文件,默认情况下应考虑实际情况调整此行为。 return true; } @Override public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { return new CustomRecordReader(); } } ``` #### 配置 Job 使用自定义 RecordReader 最后,在提交给集群运行前,需修改应用程序代码来指明所使用的 `InputFormat` 应为上述新建立的那个版本。可以通过调用 `Job.setJarByClass(Class<?>)` 和 `Job.setInputFormatClass(Class<? extends InputFormat>)` 来达成目的。 ```java Configuration conf = new Configuration(); Job job = Job.getInstance(conf); // 设置其他必要的参数... 
job.setInputFormatClass(CustomInputFormat.class); System.exit(job.waitForCompletion(true) ? 0 : 1); ``` 以上就是在不同平台(主要是指不同的操作系统环境)上针对 Hadoop 生态系统内 MapReduce 编程模型下 “安装” 或者说是配置 `RecordReader` 所涉及的主要步骤。需要注意的是这些操作都是基于编写适当的应用程序代码来进行的,而不是像常规软件那样存在单独的安装过程。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值