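# tpcds-benchmark-with-emrspark.yaml
#
# SparkApplication custom resource (spark-operator) that runs the 10 TB
# TPC-DS benchmark with EMR Spark on Kubernetes. Fill in every <YOUR-...>
# placeholder (OSS bucket, endpoint, credentials) before submitting, e.g.:
#   kubectl apply -f tpcds-benchmark-with-emrspark.yaml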
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: tpcds-benchmark-emrspark-10t
  namespace: default
spec:
  type: Scala
  mode: cluster
  image: registry.cn-beijing.aliyuncs.com/zf-spark/spark-2.4.5:for-tpc-ds-2
  imagePullPolicy: Always
  mainClass: com.databricks.spark.sql.perf.tpcds.TPCDS_Standalone
  mainApplicationFile: "oss://<YOUR-BUCKET>/jars/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar"
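  # Benchmark arguments: scale_factor 10000 selects the 10 TB TPC-DS dataset;
  # data/metadata regeneration is skipped, and queries q14a, q14b and q67 are
  # excluded from the run.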
  arguments:
    - "--dataset_location"
    - "oss://<YOUR-BUCKET>/datasets/"
    - "--output_location"
    - "oss://<YOUR-BUCKET>/outputs/ack-pr-10t-emr"
    - "--iterations"
    - "1"
    - "--shuffle_partitions"
    - "1000"
    - "--scale_factor"
    - "10000"
    - "--regenerate_dataset"
    - "false"
    - "--regenerate_metadata"
    - "false"
    - "--only_generate_data_and_meta"
    - "false"
    - "--format"
    - "parquet"
    - "--query_exclude_list"
    - "q14a,q14b,q67"
  sparkVersion: 2.4.5
  restartPolicy:
    type: Never
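  # OSS filesystem access (endpoint and credentials are placeholders) plus the
  # in-cluster Hive metastore that serves the TPC-DS table metadata.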
  hadoopConf:
    "fs.oss.impl": "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem"
    "fs.oss.endpoint": "<YOUR-OSS-ENDPOINT>"
    "fs.oss.accessKeyId": "<YOUR-ACCESS-KEY-ID>"
    "fs.oss.accessKeySecret": "<YOUR-ACCESS-KEY-SECRET>"
    hive.metastore.uris: thrift://service-hive-metastore.default:9083
    hive.metastore.client.socket.timeout: 600s
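  # Spark tuning for the benchmark. Event logs are written to OSS so a Spark
  # history server can inspect the run afterwards.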
  sparkConf:
    spark.eventLog.enabled: "true"
    spark.eventLog.dir: "oss://<YOUR-BUCKET>/spark/eventlogs"
    spark.driver.extraJavaOptions: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps"
    spark.driver.maxResultSize: 40g
    spark.executor.extraJavaOptions: "-XX:MaxDirectMemorySize=6g -XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps"
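    # Locality waits are zeroed so tasks are scheduled immediately rather than
    # waiting for a data-local executor.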
    spark.locality.wait.node: "0"
    spark.locality.wait.process: "0"
    spark.locality.wait.rack: "0"
    spark.locality.wait: "0"
    spark.memory.fraction: "0.8"
    spark.memory.offHeap.enabled: "false"
    spark.memory.offHeap.size: "17179869184"
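    # SQL settings. The dynamic.runtime.filter.*, pkfk.*, native.* and similar
    # keys appear to be EMR Spark extensions rather than open-source Spark
    # 2.4.5 options; adaptive execution is explicitly disabled.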
    spark.sql.adaptive.bloomFilterJoin.enabled: "false"
    spark.sql.adaptive.enabled: "false"
    spark.sql.analyze.column.async.delay: "200"
    spark.sql.auto.reused.cte.enabled: "true"
    spark.sql.broadcastTimeout: "3600"
    spark.sql.columnVector.offheap.enabled: "false"
    spark.sql.crossJoin.enabled: "true"
    spark.sql.delete.optimizeInSubquery: "true"
    spark.sql.dynamic.runtime.filter.bbf.enabled: "false"
    spark.sql.dynamic.runtime.filter.enabled: "true"
    spark.sql.dynamic.runtime.filter.exact.enabled: "true"
    spark.sql.dynamic.runtime.filter.table.size.lower.limit: "1069547520"
    spark.sql.dynamic.runtime.filter.table.size.upper.limit: "5368709120"
    spark.sql.files.openCostInBytes: "34108864"
    spark.sql.inMemoryColumnarStorage.compressed: "true"
    spark.sql.join.preferNativeJoin: "false"
    spark.sql.native.codecache: "true"
    spark.sql.native.codegen.wholeStage: "false"
    spark.sql.native.nativewrite: "false"
    spark.sql.pkfk.optimize.enable: "true"
    spark.sql.pkfk.riJoinElimination: "true"
    spark.sql.shuffle.partitions: "1000"
    spark.sql.simplifyDecimal.enabled: "true"
    spark.sql.sources.parallelPartitionDiscovery.parallelism: "432"
    spark.sql.sources.parallelPartitionDiscovery.threshold: "32"
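    # External shuffle service and dynamic allocation are off; shuffle data is
    # spread across twelve local disks, which must match the hostPath volumes
    # mounted into the executors below.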
    spark.shuffle.reduceLocality.enabled: "false"
    spark.shuffle.service.enabled: "false"
    spark.dynamicAllocation.enabled: "false"
    spark.local.dir: /mnt/diskb/spark-data,/mnt/diskc/spark-data,/mnt/diskd/spark-data,/mnt/diske/spark-data,/mnt/diskf/spark-data,/mnt/diskg/spark-data,/mnt/diskh/spark-data,/mnt/diski/spark-data,/mnt/diskj/spark-data,/mnt/diskk/spark-data,/mnt/diskl/spark-data,/mnt/diskm/spark-data
    spark.shuffle.manager: org.apache.spark.shuffle.sort.SortShuffleManager
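  # hostPath volumes for shuffle spill. Each /mnt/disk* directory must already
  # exist on every node that may run an executor (type: Directory does not
  # create it).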
  volumes:
    - name: diskb
      hostPath:
        path: /mnt/diskb
        type: Directory
    - name: diskc
      hostPath:
        path: /mnt/diskc
        type: Directory
    - name: diskd
      hostPath:
        path: /mnt/diskd
        type: Directory
    - name: diske
      hostPath:
        path: /mnt/diske
        type: Directory
    - name: diskf
      hostPath:
        path: /mnt/diskf
        type: Directory
    - name: diskg
      hostPath:
        path: /mnt/diskg
        type: Directory
    - name: diskh
      hostPath:
        path: /mnt/diskh
        type: Directory
    - name: diski
      hostPath:
        path: /mnt/diski
        type: Directory
    - name: diskj
      hostPath:
        path: /mnt/diskj
        type: Directory
    - name: diskk
      hostPath:
        path: /mnt/diskk
        type: Directory
    - name: diskl
      hostPath:
        path: /mnt/diskl
        type: Directory
    - name: diskm
      hostPath:
        path: /mnt/diskm
        type: Directory
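  # Driver pod: 15 cores and 50g of memory; the "spark" service account must
  # exist and be authorized by the spark-operator RBAC setup.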
  driver:
    cores: 15
    coreLimit: 15000m
    memory: 50g
    labels:
      version: 2.4.5
    serviceAccount: spark
    env:
      - name: TZ
        value: "Asia/Shanghai"
  executor:
    cores: 4
    coreLimit: 6000m
    instances: 20
    memory: 24g
    memoryOverhead: 10g
    deleteOnTermination: false
    labels:
      version: 2.4.5
    env:
      - name: TZ
        value: "Asia/Shanghai"
    volumeMounts:
      - mountPath: /mnt/diskb
        name: diskb
      - mountPath: /mnt/diskc
        name: diskc
      - mountPath: /mnt/diskd
        name: diskd
      - mountPath: /mnt/diske
        name: diske
      - mountPath: /mnt/diskf
        name: diskf
      - mountPath: /mnt/diskg
        name: diskg
      - mountPath: /mnt/diskh
        name: diskh
      - mountPath: /mnt/diski
        name: diski
      - mountPath: /mnt/diskj
        name: diskj
      - mountPath: /mnt/diskk
        name: diskk
      - mountPath: /mnt/diskl
        name: diskl
      - mountPath: /mnt/diskm
        name: diskm