-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathwrapper_workflow1.py
34 lines (26 loc) · 2.33 KB
/
wrapper_workflow1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python
"""Driver for workflow1 of the bill-matching pipeline.

Stages, in order:
  1. build the assembly jar with sbt
  2. extract candidate bill pairs on the YARN cluster
  3. prepare the valid-pairs list and the per-combination config files
  4. run BillAnalyzer once per generated config (14 combinations)
  5. harvest the per-combination outputs into one folder
  6. remove the intermediate per-combination HDFS folders
  7. postprocess the harvested result

Every external step is executed through the shell and awaited, so the
stages run strictly sequentially.
"""
import sys, os, getpass
from subprocess import Popen

base_hdfs_path = "/user"
jar_path = "target/scala-2.11/BillAnalysis-assembly-2.0.jar"


def _run(cmd):
    """Run *cmd* in a shell, block until it exits, return the exit code."""
    return Popen(cmd, shell=True).wait()


def _spark_submit(klass, executor_memory="16g"):
    """Submit Spark class *klass* from the assembly jar with the standard
    YARN/production resource settings used throughout this workflow."""
    return _run(
        "spark-submit --class {} --master yarn --deploy-mode client "
        "--queue production --num-executors 40 --executor-cores 3 "
        "--executor-memory {} --driver-memory 20g {}".format(
            klass, executor_memory, jar_path))


def main():
    # HDFS home of the user running the workflow, e.g. /user/<name>.
    user_base = os.path.join(base_hdfs_path, getpass.getuser())

    _run("sbt assembly")
    _spark_submit("org.princeton.billmatch.ExtractCandidates")
    _run("python prepare_valid_pairs.py {}".format(
        os.path.join(user_base, "valid_pairs")))
    _run("python prepare_config_files.py {} {} {}".format(
        os.path.join(user_base, "bills_combined"),
        os.path.join(user_base, "valid_pairs"),
        os.path.join(user_base, "output_sample")))

    # One analyzer pass per generated configuration; the jar bakes the
    # active config in, so it must be copied into resources and rebuilt
    # before each submission.
    for comb in range(14):
        _run("cp workflow1_billAnalyzer" + str(comb) +
             ".conf src/main/resources/workflow1_billAnalyzer.conf")
        _run("sbt assembly")
        # NOTE(review): this stage uses 15g executors while every other
        # stage uses 16g — looks deliberate, preserved as-is.
        _spark_submit("org.princeton.billmatch.BillAnalyzer",
                      executor_memory="15g")

    # run harvester changing app=workflow1
    _spark_submit("org.princeton.billmatch.utils.HarvestOutput")

    # cleanup folders: read the output path from the last-used config and
    # delete the per-combination folders (<base>_0, <base>_1, ...).
    with open("src/main/resources/workflow1_billAnalyzer.conf", "r") as f:
        config = f.readlines()
    outputMainFile = None
    for line in config:
        if "outputMainFile" in line:
            outputMainFile = line.split("=")[-1].lstrip(" \"").rstrip("\",\n")
    if outputMainFile is None:
        # Original code would raise NameError here; fail with a clear message.
        sys.exit("outputMainFile not found in workflow1_billAnalyzer.conf")
    # Drop the trailing _<comb> suffix to get the common folder prefix.
    outputMainFile = "_".join(outputMainFile.split("_")[:-1])
    _run("hdfs dfs -rmr " + outputMainFile + "_*")
    print("The workflow1 results are in folder {}".format(outputMainFile))

    # do postprocessing
    _spark_submit("org.princeton.billmatch.utils.Postprocessor")
    # print("The workflow1 skim results are in folder {}".format(outputMainFile))
    # print("The workflow1 light results are in folder {}".format(outputMainFile))


if __name__ == "__main__":
    main()