-
Notifications
You must be signed in to change notification settings - Fork 471
/
pyspark-session-2020-02-03.txt
executable file
·93 lines (92 loc) · 2.67 KB
/
pyspark-session-2020-02-03.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
mparsian@Mahmouds-MacBook ~/spark-2.4.4 $ ./bin/pyspark
Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/__ / .__/\_,_/_/ /_/\_\ version 2.4.4
/_/
Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
SparkSession available as 'spark'.
>>>
>>>
>>> numbers = [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30]
>>>
>>>
>>> numbers
[1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30]
>>> len(numbers)
56
>>> rdd = spark.sparkContext.parallelize(numbers)
>>> rdd.count()
56
>>> rdd.collect()
[1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30]
>>>
>>>
def min_max_count(partition):
    """Summarise one partition as (minimum, maximum, count) in a single pass.

    Args:
        partition: an iterable (typically the iterator Spark hands to
            ``mapPartitions``) of mutually comparable values.

    Returns:
        A bare tuple ``(min, max, count)`` for a non-empty partition, or an
        empty tuple ``()`` when the partition has no elements.

    NOTE: the result is a bare tuple, not a list containing a tuple.  When
    this function is passed to ``RDD.mapPartitions`` the returned tuple is
    itself iterated, so the three values become three separate RDD elements
    (8 partitions -> 24 elements in this session).  Return ``[(min, max,
    count)]`` instead to keep one record per partition.
    """
    values = iter(partition)
    try:
        # Seed both extremes from the first element instead of tracking a
        # "first_time" flag inside the loop.
        smallest = largest = next(values)
    except StopIteration:
        # Empty partition: previously min2/max2 were never assigned and the
        # return statement raised UnboundLocalError.  Contribute nothing.
        return ()

    count = 1
    for n in values:
        count += 1
        smallest = min(smallest, n)
        largest = max(largest, n)
    return (smallest, largest, count)
>>>
>>> target = rdd.mapPartitions(min_max_count)
>>> target.count()
24
>>> target.collect()
[-2, 3, 7, -2, 3, 7, -2, 3, 7, -20, 30, 7, -2, 3, 7, -2, 3, 7, -2, 3, 7, -20, 30, 7]
>>>
>>>
def min_max_count(partition):
    """Summarise one partition as a single (minimum, maximum, count) record.

    Intended for ``RDD.mapPartitions``, which expects an iterable of output
    elements per partition; wrapping the tuple in a list keeps it as ONE
    element instead of being flattened into three values.

    Args:
        partition: an iterable (typically the iterator Spark hands to
            ``mapPartitions``) of mutually comparable values.

    Returns:
        ``[(min, max, count)]`` for a non-empty partition, or ``[]`` when the
        partition has no elements, so it contributes no record.
    """
    values = iter(partition)
    try:
        # Seed both extremes from the first element instead of tracking a
        # "first_time" flag inside the loop.
        smallest = largest = next(values)
    except StopIteration:
        # Empty partition: previously min2/max2 were never assigned and the
        # return statement raised UnboundLocalError.  Emit no record.
        return []

    count = 1
    for n in values:
        count += 1
        smallest = min(smallest, n)
        largest = max(largest, n)
    return [(smallest, largest, count)]
>>>
>>> target = rdd.mapPartitions(min_max_count)
>>> target.collect()
[(-2, 3, 7), (-2, 3, 7), (-2, 3, 7), (-20, 30, 7), (-2, 3, 7), (-2, 3, 7), (-2, 3, 7), (-20, 30, 7)]
>>>
>>> rdd.getNumPartitions()
8
>>> rdd = spark.sparkContext.parallelize(numbers, 4)
>>> rdd.getNumPartitions()
4
>>> target = rdd.mapPartitions(min_max_count)
>>> target.collect()
[(-2, 3, 14), (-20, 30, 14), (-2, 3, 14), (-20, 30, 14)]
>>>
>>>
>>>
def add_t3(x, y):
    """Merge two (min, max, count) summaries into one.

    Args:
        x: indexable summary whose items 0, 1, 2 are min, max and count.
        y: second summary with the same layout.

    Returns:
        A tuple ``(min of both minimums, max of both maximums, summed count)``.
    """
    # Counts are added first, then the extremes are combined.
    merged_count = x[2] + y[2]
    return (min(x[0], y[0]), max(x[1], y[1]), merged_count)
>>>
>>> add_t3( (2, 5, 40), (7, 50, 60))
(2, 50, 100)
>>> final_result = target.reduce(add_t3)
>>> final_result
(-20, 30, 56)
>>>
>>>