Hadoop word frequency count – wordcount

Count how many times each word appears in wc.txt.
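For example, assuming wc.txt contains comma-separated words (the mapper below splits each line on commas), the input and output could look like the following. The sample content is made up for illustration only:

wc.txt:
hadoop,spark,hadoop
spark,hive,hadoop

Expected result (part-r-00000):
hadoop	3
hive	1
spark	2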

package wordCount;

/*
 * Count how many times each word appears in wc.txt.
 */

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


public class wordcount {

    // Map side
    public static class MapTask extends Mapper<LongWritable, Text, Text, IntWritable> {
        // (hadoop,hadoop,spark,spark) ==> (hadoop,1) (hadoop,1) (spark,1) (spark,1)
        // Called once for every line of input
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(",");
            for (String word : words) {
                // Emit (word, 1) for each word
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }

    // Reduce side
    public static class ReduceTask extends Reducer<Text, IntWritable, Text, IntWritable> {
        // (hadoop,(1,1,1,1...)) (spark,(1,1,1,1...)) ==> (hadoop,n) (spark,n)
        // Called once per key; the map output is the reduce input
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the counts for this word
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            // Emit (word, total count)
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        // When submitting to the cluster, make sure to run as the root user
        System.setProperty("HADOOP_USER_NAME", "root");

        // To submit the task we need a Job object, which is responsible for submitting
        // this MapReduce program to the cluster for execution
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.120.110:9000");
        Job job = Job.getInstance(conf);

        // Register the map-side and reduce-side static classes with the job
        job.setMapperClass(wordcount.MapTask.class);
        job.setReducerClass(wordcount.ReduceTask.class);
        job.setJarByClass(wordcount.class);

        // Tell the job the four output types; note: only the output types are needed here
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // If the output directory already exists, delete it
        String outPath = "/bigdata/output/wordcount";
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(new Path(outPath))) {
            fileSystem.delete(new Path(outPath), true);
        }

        // Tell the job the input and output paths
        FileInputFormat.addInputPath(job, new Path("/bigdata/input/wc.txt"));
        FileOutputFormat.setOutputPath(job, new Path(outPath));

        // Submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "The code works fine!!!" : "There's a bug, go take a look!!!");
    }
}
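A common way to run the job is to package the class into a jar and submit it with the hadoop command; since fs.defaultFS is set in the driver, it can also be launched directly from an IDE. The jar name and local file paths below are assumptions for illustration, while the HDFS paths match the ones hard-coded in main:

# Upload the input file to HDFS (same path as in main)
hdfs dfs -mkdir -p /bigdata/input
hdfs dfs -put wc.txt /bigdata/input/wc.txt

# Package the project (e.g. with Maven) and submit it; wordcount.jar is a hypothetical jar name
hadoop jar wordcount.jar wordCount.wordcount

# Inspect the result
hdfs dfs -cat /bigdata/output/wordcount/part-r-00000

Because the reducer simply sums the incoming counts, the same class could also be registered as a combiner (job.setCombinerClass(wordcount.ReduceTask.class)) to pre-aggregate on the map side; the job above does not do this.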
