Skip to content

Commit 482d747

Browse files
added word count example classes
1 parent 048d6a5 commit 482d747

File tree

5 files changed

+51
-29
lines changed

5 files changed

+51
-29
lines changed

project4/.gitignore

+2
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
/bin/
2+
/target/
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
eclipse.preferences.version=1
2+
encoding//src/main/java=UTF-8
3+
encoding//src/main/resources=UTF-8
4+
encoding/<project>=UTF-8
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
package com.jobreadyprogrammer.mappers;
2+
3+
import java.util.Arrays;
4+
import java.util.Iterator;
5+
6+
import org.apache.spark.api.java.function.FlatMapFunction;
7+
import org.apache.spark.sql.Row;
8+
9+
public class LineMapper implements FlatMapFunction<Row, String> {
10+
11+
private static final long serialVersionUID = 1L;
12+
13+
@Override
14+
public Iterator<String> call(Row value) throws Exception {
15+
16+
return Arrays.asList(value.toString().split(" ")).iterator();
17+
18+
}
19+
20+
}

project4/src/main/java/com/jobreadyprogrammer/pojos/Line.java

+2-8
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,8 @@
11
package com.jobreadyprogrammer.pojos;
22

3-
import java.io.Serializable;
4-
5-
public class Line implements Serializable {
3+
public class Line {
64

7-
/**
8-
*
9-
*/
10-
private static final long serialVersionUID = 1L;
11-
String[] words;
5+
private static String[] words;
126

137
public String[] getWords() {
148
return words;

project4/src/main/java/com/jobreadyprogrammer/spark/WordCount.java

+23-21
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,13 @@
11
package com.jobreadyprogrammer.spark;
22

3-
import org.apache.spark.api.java.function.MapFunction;
43
import org.apache.spark.sql.Dataset;
54
import org.apache.spark.sql.Encoders;
65
import org.apache.spark.sql.Row;
76
import org.apache.spark.sql.SparkSession;
87

9-
import com.jobreadyprogrammer.pojos.Line;
8+
import com.jobreadyprogrammer.mappers.LineMapper;
9+
10+
import breeze.linalg.Options.Value;
1011

1112
public class WordCount {
1213

@@ -25,26 +26,27 @@ public void start() {
2526
df.show(5);
2627
df.printSchema();
2728

28-
Dataset<Line> houseDS = df.map(
29-
new MapFunction<Row, Line>(){
30-
31-
private static final long serialVersionUID = -2L;
32-
33-
@Override
34-
public Line call(Row value) throws Exception {
35-
String[] words = value.toString().split(" ");
36-
Line l = new Line();
37-
l.setWords(words);
38-
39-
return l;
40-
}
41-
42-
},
43-
44-
Encoders.bean(Line.class));
29+
Dataset<String> lineDS = df.flatMap(
30+
new LineMapper(), Encoders.STRING());
31+
4532

46-
houseDS.printSchema();
47-
houseDS.show(10, 50);
33+
lineDS.printSchema();
34+
lineDS.show(10, 200);
35+
36+
String boringWords = "( 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by',\r\n" +
37+
" 'for', 'if', 'in', 'into', 'is', 'it',\r\n" +
38+
" 'no', 'not', 'of', 'on', 'or', 'such',\r\n" +
39+
" 'that', 'the', 'their', 'then', 'there', 'these',\r\n" +
40+
" 'they', 'this', 'to', 'was', 'will', 'with', 'he', 'she')";
41+
42+
Dataset<Row> df2 = lineDS.toDF();
43+
df2 = df2.groupBy("value").count();
44+
df2 = df2.filter("lower(value) NOT IN" + boringWords);
45+
df2 = df2.orderBy(df2.col("count").desc());
46+
47+
48+
df2.printSchema();
49+
df2.show(100);
4850
}
4951

5052

0 commit comments

Comments
 (0)