|
8 | 8 |
|
9 | 9 | ---
|
10 | 10 |
|
| 11 | +Initializing Spark |
| 12 | + |
| 13 | + |
| 14 | +```python |
| 15 | +#SparkContext |
| 16 | +from pyspark import SparkContext |
| 17 | +sc = SparkContext(master = 'local[2]') |
| 18 | +``` |
| 19 | + |
| 20 | + |
| 21 | +```python |
| 22 | +#Inspect SparkContext |
| 23 | +sc.version |
| 24 | +sc.pythonVer |
| 25 | +sc.master |
| 26 | +str(sc.sparkHome) |
| 27 | +str(sc.sparkUser()) |
| 28 | +sc.appName |
| 29 | +sc.applicationId |
| 30 | +sc.defaultParallelism |
| 31 | +sc.defaultMinPartitions |
| 32 | +``` |
| 33 | + |
| 34 | + |
| 35 | +```python |
| 36 | +#Configuration |
| 37 | +from pyspark import SparkConf, SparkContext |
| 38 | +conf = (SparkConf().setMaster("local").setAppName("My app").set("spark.executor.memory", "1g")) |
| 39 | +sc = SparkContext(conf = conf) |
| 40 | +``` |
| 41 | + |
| 42 | + |
| 43 | + |
| 44 | +Loading Data |
| 45 | + |
| 46 | + |
| 47 | + |
| 48 | +```python |
| 49 | +#Parallelized Collections |
| 50 | +rdd = sc.parallelize([('a',7),('a',2),('b',2)]) |
| 51 | +rdd2 = sc.parallelize([('a',2),('d',1),('b',1)]) |
| 52 | +rdd3 = sc.parallelize(range(100)) |
| 53 | +rdd4 = sc.parallelize([("a",["x","y","z"]),("b",["p", "r"])]) |
| 54 | +``` |
| 55 | + |
| 56 | + |
| 57 | +```python |
| 58 | +#External Data |
| 59 | +textFile = sc.textFile("/my/directory/*.txt") |
| 60 | +textFile2 = sc.wholeTextFiles("/my/directory/") |
| 61 | +``` |
| 62 | + |
| 63 | + |
| 64 | + |
| 65 | +Selecting Data |
| 66 | + |
| 67 | + |
| 68 | + |
| 69 | +```python |
| 70 | +#Getting |
| 71 | +rdd.collect() #[('a', 7), ('a', 2), ('b', 2)] |
| 72 | +rdd.take(2) #[('a', 7), ('a', 2)] |
| 73 | +rdd.first() #('a', 7) |
| 74 | +rdd.top(2) #[('b', 2), ('a', 7)] |
| 75 | + |
| 76 | +#Sampling |
| 77 | +rdd3.sample(False, 0.15, 81).collect() #[3,4,27,31,40,41,42,43,60,76,79,80,86,97] |
| 78 | + |
| 79 | +#Filtering |
| 80 | +rdd.filter(lambda x: "a" in x).collect() #[('a',7),('a',2)] |
| 81 | +rdd5.distinct().collect() #['a',2,'b',7] (where rdd5 = rdd.flatMap(lambda x: x + (x[1], x[0]))) |
| 82 | +rdd.keys().collect() #['a', 'a', 'b'] |
| 83 | + |
| 84 | +#Iterating |
| 85 | +def g(x): print(x) |
| 86 | +rdd.foreach(g) |
| 87 | +``` |
| 88 | + |
| 89 | +``` |
| 90 | +('a', 7) |
| 91 | +('b', 2) |
| 92 | +('a', 2) |
| 93 | +``` |
| 94 | + |
| 95 | + |
| 96 | +```python |
| 97 | + |
| 98 | +``` |
| 99 | + |
11 | 100 | #
|
12 | 101 |
|
13 | 102 | #
|
|
0 commit comments