Hadoop word count
Last published: 2023-10-30 21:24:11
Learning resources
- Writing WordCount with VSCode
- Hadoop web: http://localhost:9870/
- Yarn: http://localhost:8088/cluster
- https://github.com/bioproj/hadoop-demo
Testing that HDFS works properly
hdfs dfs -ls /
echo "dog dog cat" > input.txt
hdfs dfs -rm /user/input.txt
hdfs dfs -put input.txt /user
hdfs dfs -ls /user
hadoop jar hadoop-mapreduce-examples-3.3.6.jar wordcount /user/input.txt /user/output
hadoop jar hadoop-mapreduce-examples-3.3.6.jar pi 2 10
Testing the local file system
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Main {
    public static void main(String[] args) throws Exception {
        // With no Configuration, fs.defaultFS defaults to file:///, so this
        // runs against the local file system
        Job job = Job.getInstance();
        job.setJobName("word count");
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/home/wangyang/workspace/hadoop/hadoop-demo/workDir/input.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/home/wangyang/workspace/hadoop/hadoop-demo/workDir/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
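The driver references TokenizerMapper and IntSumReducer, which the post does not show (the HDFS and YARN drivers below reuse them). A minimal sketch of the two classes, following the standard Hadoop WordCount example, each in its own source file:

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// TokenizerMapper.java: emit (word, 1) for every whitespace-separated token
public class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}

// IntSumReducer.java: sum the counts emitted for each word
public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}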
After the job completes, the working directory contains:
.
├── input.txt
└── output
    ├── part-r-00000
    └── _SUCCESS
The file part-r-00000 contains:
a 4
v 2
Testing the HDFS file system
public class Main {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("word count");
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Fully-qualified hdfs:// URIs, so no extra Configuration is needed
        FileInputFormat.addInputPath(job, new Path("hdfs://server:8020/user/input.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://server:8020/user/output3"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
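Note that the output directory changes on every run (output, output3, output5, output6 in this post): FileOutputFormat refuses to write into an existing directory and fails with a FileAlreadyExistsException. A small sketch, not from the original post, that deletes the old output so the same path can be reused:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Inside main(), before FileOutputFormat.setOutputPath(job, output):
Path output = new Path("hdfs://server:8020/user/output3");
FileSystem fs = FileSystem.get(URI.create("hdfs://server:8020"), new Configuration());
if (fs.exists(output)) {
    fs.delete(output, true); // true = delete recursively
}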
Submitting to YARN
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmit {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // 1. Default file system the job accesses at runtime
        conf.set("fs.defaultFS", "hdfs://server:8020");
        // 2. Where the job is submitted: the YARN ResourceManager
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "server");
        Job job = Job.getInstance(conf);
        job.setJobName("word count");
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // With fs.defaultFS set, paths no longer need the hdfs://server:8020 prefix:
        // FileInputFormat.addInputPath(job, new Path("hdfs://server:8020/user/input.txt"));
        // FileOutputFormat.setOutputPath(job, new Path("hdfs://server:8020/user/output5"));
        FileInputFormat.addInputPath(job, new Path("/user/input.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/user/output6"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
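When the job is launched from an IDE rather than with hadoop jar, the client must also tell YARN which jar contains TokenizerMapper and IntSumReducer, or the containers fail with ClassNotFoundException. A minimal sketch of the extra client-side settings (the jar path is a placeholder, not from the original post):

// Assumption: the project has been packaged first (e.g. mvn package); adjust the path.
conf.set("mapreduce.job.jar", "target/hadoop-demo-1.0.jar");
// Only needed when the client OS differs from the cluster OS (e.g. a Windows client):
conf.set("mapreduce.app-submission.cross-platform", "true");
// When launched with `hadoop jar` instead, job.setJarByClass(JobSubmit.class) is enough.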
Problems encountered
When using the HDFS file system: Server IPC version 9 cannot communicate with client version 4
This error appears when Java initializes the HDFS client; the cause is that the version of the Maven dependency does not match the version of HDFS.
The hadoop-core artifact in the Maven repository only goes up to version 1.2.1, so it must not be on the classpath. The local Hadoop installed for testing was hadoop-3.3.6, so using version 3.3.6 of hadoop-common + hadoop-hdfs + hadoop-client resolves it (hadoop-client alone is enough, since it pulls the others in transitively).
Replace the following dependencies:
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-core</artifactId>
        <version>1.2.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.3.4</version>
    </dependency>
</dependencies>
with:
<properties>
    <hadoop.version>3.3.6</hadoop.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>