【发布时间】:2017-05-14 15:41:23
【问题描述】:
我正在完成一项任务并学习使用 Hadoop。我正在尝试使用 Wikipedia 页面,但在尝试设置输入格式时不断出现错误。这是我的代码:
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import edu.umd.cloud9.collection.wikipedia.WikipediaPage;
import edu.umd.cloud9.collection.wikipedia.WikipediaPageInputFormat;
public class InvertedIndex {
public static class InvertedIndexMapper extends MapReduceBase
implements Mapper<LongWritable, WikipediaPage, Text, Text> {
private Text title = new Text();
private Text word = new Text();
public void map(LongWritable key, WikipediaPage value,
OutputCollector<Text, Text> output, Reporter report)
throws IOException {
String articleTitle = value.getTitle();
title.set(articleTitle);
String content = value.getContent();
String[] words = content.split(" ");
for (String s : words) {
word.set(s);
output.collect(word, title);
}
}
}
public static class InvertedIndexReducer extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
public void reduce (Text key, Iterator<Text> values,
OutputCollector<Text, Text> output, Reporter report)
throws IOException {
Set<String> articlesSet = new HashSet<String>();
Text articleNames = new Text();
while (values.hasNext()) {
articlesSet.add(values.toString());
}
String names = "";
for (String s : articlesSet) {
names += s + ", ";
}
articleNames.set(names);
output.collect(key, articleNames);
}
}
public static void main (String[] args) throws Exception {
JobConf conf = new JobConf(InvertedIndex.class);
conf.setJobName("InvertedIndex");
conf.setInputFormat(WikipediaPageInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
conf.setMapperClass(InvertedIndexMapper.class);
conf.setReducerClass(InvertedIndexReducer.class);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(Text.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
long startTime = System.currentTimeMillis();
JobClient.runJob(conf);
System.out.println("Job finished in :" + (System.currentTimeMillis() - startTime) / 1000 + " seconds");
}
}
我正在使用 Eclipse,并在 conf.setInputFormat(WikipediaPageInputFormat.class); 这一行得到以下错误:
JobConf 类型中的方法 setInputFormat(Class<? extends InputFormat>) 不适用于参数 (Class<WikipediaPageInputFormat>)
当我尝试编译时,出现以下错误:
InvertedIndex.java:81: 找不到符号
符号:方法 setInputFormatClass(java.lang.Class<edu.umd.cloud9.collection.wikipedia.WikipediaPageInputFormat>)
位置:类 org.apache.hadoop.mapred.JobConf
conf.setInputFormatClass(WikipediaPageInputFormat.class);
如果有人以前遇到过这种情况,希望能就如何解决这个问题给我一些帮助或建议,我将不胜感激。
谢谢!
【问题讨论】:
标签: hadoop