Skip to content

Commit cb87e45

Browse files
init of project 5
1 parent ed4f46d commit cb87e45

File tree

4 files changed

+33
-7
lines changed

4 files changed

+33
-7
lines changed

project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc

Whitespace-only changes.

project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet

Whitespace-only changes.

project5/src/main/java/com/jobreadyprogrammer/spark/Application.java

+32-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import org.apache.spark.sql.Row;
55
import org.apache.spark.sql.SparkSession;
66

7+
import static org.apache.spark.sql.functions.*;
8+
9+
710
public class Application {
811

912
public static void main(String[] args) {
@@ -14,16 +17,40 @@ public static void main(String[] args) {
1417
.getOrCreate();
1518

1619

17-
String filename = "src/main/resources/grade_chart.csv";
20+
String studentsFile = "src/main/resources/students.csv";
1821

19-
Dataset<Row> df = spark.read().format("csv")
22+
Dataset<Row> studentDf = spark.read().format("csv")
2023
.option("inferSchema", "true") // Make sure to use string version of true
2124
.option("header", true)
22-
.load(filename);
25+
.load(studentsFile);
26+
27+
String gradeChartFile = "src/main/resources/grade_chart.csv";
28+
29+
Dataset<Row> gradesDf = spark.read().format("csv")
30+
.option("inferSchema", "true") // Make sure to use string version of true
31+
.option("header", true)
32+
.load(gradeChartFile);
33+
34+
35+
// How to join tables
36+
// Talk about how you can drop the df. qualifier from df.col() and just use the statically imported col()
37+
// Talk about using just column names in strings in the select
38+
// Talk about how you can also just use the col() function instead of df.col()
39+
// Start by removing the df. prefix, then go on to remove the col() as well to show the stripped-down version
40+
// Talk about how adding a filter after the select limits what you can filter on (only the selected columns are available), unlike SQL.
41+
// Always put your select at the end, after all of your filtering
42+
studentDf.join(gradesDf, studentDf.col("GPA").equalTo((gradesDf.col("gpa"))))
43+
// .drop("gpa").drop("GPA")
44+
.select(studentDf.col("student_name"),
45+
gradesDf.col("letter_grade"),
46+
studentDf.col("favorite_book_title"),
47+
studentDf.col("GPA")) // must have this for below filter to work
48+
.filter(col("GPA").between(2, 3.5)).show();
49+
// .filter(upper(col("letter_grade")).like("B")).show();
50+
2351

24-
df.show(10);
25-
2652
}
2753

2854

55+
2956
}

project5/src/main/resources/students.csv

+1-2
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,4 @@ student_id,student_name,State,GPA,favorite_book_title,working
1010
1190,Dan Iacovelli,CA,3.5,The Hunger Games,FALSE
1111
1200,Ned Alvin,CA,1.0,,TRUE
1212
1210,Sidney Ducote,FL,1.5,The Secret Garden,FALSE
13-
1220,Bobbie Shrader,FL,2.0,The Color Purple,FALSE
14-
,,,,,
13+
1220,Bobbie Shrader,FL,2.0,The Color Purple,FALSE

0 commit comments

Comments
 (0)