背景
MapReduce作为分布式计算框架,一般情况下要依赖于HDFS在linux环境下运行,打包运行成本高,但我们开发时往往在Windows环境下,有没有一种方式在Windows上像调试普通程序一样调试MR程序呢?
RunJar是Hadoop提供的工具类,专门用于运行jar文件中的程序。结合Maven工具,可以实现在Windows环境下调试MR程序的目的。
调试方法(以WordCount为例)
1.使用Maven构建MR工程
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven project descriptor for the mapreduce-test project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates. -->
<groupId>com.hikdata</groupId>
<artifactId>mapreduce-test</artifactId>
<version>1.0.0</version>
<properties>
<!-- Single Hadoop version shared by every Hadoop dependency below via ${hadoop.version}. -->
<hadoop.version>2.7.3</hadoop.version>
</properties>
<dependencies>
<!-- Hadoop common library. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- Hadoop MapReduce client libraries (core, common, jobclient). -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
</project>
2.编写MR程序
package com.hikdata;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Classic WordCount example: counts how often each word occurs in the input files.
 *
 * <p>Expects at least two program arguments: the input path followed by the output path.
 */
public class WordCountApp {

    /**
     * Splits every input line on whitespace and emits {@code (word, 1)} for each non-empty token.
     */
    static class WordMapper extends Mapper<Object, Text, Text, IntWritable> {

        /** Constant count written for every word occurrence. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Output key reused across writes, following the Hadoop WordCount tutorial pattern. */
        private final Text word = new Text();

        /**
         * Tokenizes one line of text and writes one record per word.
         *
         * @param key     input key supplied by the input format (not used)
         * @param value   one line of input text
         * @param context sink for the {@code (word, 1)} records
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split on runs of any whitespace so that tabs and repeated spaces act as a
            // single separator; a blank line or leading whitespace still yields one empty
            // token, which is skipped so that "" is never counted as a word.
            for (String token : value.toString().split("\\s+")) {
                if (token.isEmpty()) {
                    continue;
                }
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    /**
     * Sums all counts received for a word and emits {@code (word, total)}.
     */
    static class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Output value reused across writes. */
        private final IntWritable total = new IntWritable();

        /**
         * Adds up the partial counts of one word.
         *
         * @param key     the word
         * @param values  the counts emitted for that word
         * @param context sink for the {@code (word, total)} record
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable w : values) {
                count += w.get();
            }
            total.set(count);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the word-count job, blocking until it finishes.
     *
     * <p>Terminates the JVM with status 1 when too few arguments are given or when the job
     * reports failure; returns normally when the job succeeds.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if the job cannot be set up or submitted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if the wait for job completion is interrupted
     */
    public static void main(String[] args)
            throws IOException,
            ClassNotFoundException,
            InterruptedException {
        System.out.println(args.length);
        // AppConstants is declared elsewhere in the project; presumably PARAMS_MIN_2 == 2,
        // matching the two arguments read below -- TODO confirm.
        if (args.length < AppConstants.PARAMS_MIN_2) {
            System.out.println("参数不足");
            System.exit(1);
        }
        String inputPath = args[0];
        String outputPath = args[1];

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("word count");
        job.setJarByClass(WordCountApp.class);

        job.setMapperClass(WordMapper.class);
        job.setReducerClass(WordReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Declare the reducer's output types explicitly so the job configuration matches
        // what WordReducer actually writes.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Surface a failed job through the exit status instead of silently returning.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}
代码结构如下图
3.将程序打成Jar包
在IDEA工具中将项目按如下配置打成Jar包,注意排除Hadoop相关依赖包。
4.编写启动程序
使用RunJar类启动MR应用,RunJar参数列表见程序注释。input路径为输入路径,在其中放入一些文本文件;output路径为程序输出路径。
import org.apache.hadoop.util.RunJar;
/**
 * Launches the MapReduce application through Hadoop's RunJar tool so that it can be
 * debugged on Windows.
 */
public class Starter {

    /**
     * Builds the RunJar argument list and hands it to {@code RunJar.main}.
     *
     * @param args ignored; the arguments passed to RunJar are fixed below
     * @throws Throwable whatever {@code RunJar.main} propagates
     */
    public static void main(String[] args) throws Throwable {
        // Arguments accepted by RunJar:
        //   [0]   required: location of the jar to execute
        //   [1]   required: main class of the program to execute
        //   [...] optional: further arguments, parsed by that main class
        // NOTE(review): RunJar may take the main class from the jar manifest when a
        // Main-Class entry is present -- confirm the jar is built without one.
        final String jarLocation = "E:/project/mapreduce-test/mapreduce-test.jar";
        final String mainClass = "com.hikdata.WordCountApp";
        final String inputGlob = "input/*";
        final String outputDir = "output/";

        RunJar.main(new String[] {jarLocation, mainClass, inputGlob, outputDir});
    }
}
5.启动调试
可能遇到的问题
1.输出路径已经存在
Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException:
Output directory file:.../output already exists
解决方法:删除output目录
2.无权限读写
Exception in thread "main" java.io.IOException:
(null) entry in command string: null chmod 0700
E:\tmp\hadoop-Administrator\mapred\staging\
错误原因:Windows下缺少Hadoop的本地库文件,MR程序无法设置文件的读写权限(执行chmod失败),需要hadoop相关库文件支持
解决方案:从 https://github.com/4ttty/winutils 下载对应版本的hadoop.dll,拷贝到Windows的C:\Windows\System32目录