Hadoop word count
Last published: 2023-10-30 21:24:11
Learning resources
- Writing WordCount with VSCode
- Hadoop web: http://localhost:9870/
- Yarn: http://localhost:8088/cluster
- https://github.com/bioproj/hadoop-demo
Testing that HDFS works properly
hdfs dfs -ls /
echo "dog dog cat" > input.txt
hdfs dfs -rm /user/input.txt
hdfs dfs -put input.txt /user
hdfs dfs -ls /user
hadoop jar hadoop-mapreduce-examples-3.3.6.jar wordcount /user/input.txt /user/output
hadoop jar hadoop-mapreduce-examples-3.3.6.jar pi 2 10
Testing the local file system
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Main {
    public static void main(String[] args) throws Exception {
        // With no Configuration, fs.defaultFS defaults to file:///, so this
        // runs against the local file system
        Job job = Job.getInstance();
        job.setJobName("word count");
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/home/wangyang/workspace/hadoop/hadoop-demo/workDir/input.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/home/wangyang/workspace/hadoop/hadoop-demo/workDir/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
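The driver references TokenizerMapper and IntSumReducer, which the post does not show (the HDFS and YARN drivers below reuse them). A minimal sketch of the two classes, following the standard Hadoop WordCount example, each in its own source file:

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// TokenizerMapper.java: emit (word, 1) for every whitespace-separated token
public class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}

// IntSumReducer.java: sum the counts emitted for each word
public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}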
After the job completes, the working directory contains:
.
├── input.txt
└── output
    ├── part-r-00000
    └── _SUCCESS
The file part-r-00000 contains:
a 4
v 2
Testing the HDFS file system
public class Main {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("word count");
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Fully-qualified hdfs:// URIs, so no extra Configuration is needed
        FileInputFormat.addInputPath(job, new Path("hdfs://server:8020/user/input.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://server:8020/user/output3"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
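Note that the output directory changes on every run (output, output3, output5, output6 in this post): FileOutputFormat refuses to write into an existing directory and fails with a FileAlreadyExistsException. A small sketch, not from the original post, that deletes the old output so the same path can be reused:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Inside main(), before FileOutputFormat.setOutputPath(job, output):
Path output = new Path("hdfs://server:8020/user/output3");
FileSystem fs = FileSystem.get(URI.create("hdfs://server:8020"), new Configuration());
if (fs.exists(output)) {
    fs.delete(output, true); // true = delete recursively
}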
Submitting to YARN
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmit {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // 1. Default file system the job accesses at runtime
        conf.set("fs.defaultFS", "hdfs://server:8020");
        // 2. Where the job is submitted: the YARN ResourceManager
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "server");
        Job job = Job.getInstance(conf);
        job.setJobName("word count");
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // With fs.defaultFS set, paths no longer need the hdfs://server:8020 prefix:
        // FileInputFormat.addInputPath(job, new Path("hdfs://server:8020/user/input.txt"));
        // FileOutputFormat.setOutputPath(job, new Path("hdfs://server:8020/user/output5"));
        FileInputFormat.addInputPath(job, new Path("/user/input.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/user/output6"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
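When the job is launched from an IDE rather than with hadoop jar, the client must also tell YARN which jar contains TokenizerMapper and IntSumReducer, or the containers fail with ClassNotFoundException. A minimal sketch of the extra client-side settings (the jar path is a placeholder, not from the original post):

// Assumption: the project has been packaged first (e.g. mvn package); adjust the path.
conf.set("mapreduce.job.jar", "target/hadoop-demo-1.0.jar");
// Only needed when the client OS differs from the cluster OS (e.g. a Windows client):
conf.set("mapreduce.app-submission.cross-platform", "true");
// When launched with `hadoop jar` instead, job.setJarByClass(JobSubmit.class) is enough.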
Problems encountered
When using the HDFS file system: Server IPC version 9 cannot communicate with client version 4
This error appears when Java initializes the HDFS client; the cause is that the version of the Maven dependency does not match the version of HDFS.
The hadoop-core artifact in the Maven repository only goes up to version 1.2.1, so it must not be on the classpath. The local Hadoop installed for testing was hadoop-3.3.6, so using version 3.3.6 of hadoop-common + hadoop-hdfs + hadoop-client resolves it (hadoop-client alone is enough, since it pulls the others in transitively).
Replace the following dependencies:
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-core</artifactId>
        <version>1.2.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.3.4</version>
    </dependency>
</dependencies>
with:
<properties>
    <hadoop.version>3.3.6</hadoop.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>