BC changes and autotuner support for GPU backend

GraphIt-DSL · Apr 21, 2020 · 08c3ae8 · 08c3ae8
1 parent 6754d68
commit 08c3ae8
Show file tree

Hide file tree

Showing 27 changed files with 968 additions and 101 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,6 @@
 .settings/
 .idea/
 build/
+autotune/*.json
+*.graphit_bin
+*.graphit_sbin
diff --git a/autotune/compile_gpu.sh b/autotune/compile_gpu.sh
@@ -1,2 +1,3 @@
 python ../build/bin/graphitc.py -a algotorun.gt -f schedule_0 -o test.cu
 /usr/local/cuda/bin/nvcc  -ccbin /usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=80 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_70,code=sm_70 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=3
+#/usr/local/cuda/bin/nvcc  -ccbin /usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=60 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_61,code=sm_61 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=2
diff --git a/autotune/gpu_apps/bfs.gt b/autotune/gpu_apps/bfs.gt
@@ -0,0 +1,41 @@
+element Vertex end
+element Edge end
+
+const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]);
+const vertices : vertexset{Vertex} = edges.getVertices();
+const parent : vector{Vertex}(int) = -1;
+
+
+func updateEdge(src : Vertex, dst : Vertex)
+    parent[dst] = src;
+end
+
+func toFilter(v : Vertex) -> output : bool
+    output =  parent[v] == -1;
+end
+
+func reset(v: Vertex)
+    parent[v] = -1;
+end
+
+func main()
+    for trail in 0:10
+    	var frontier : vertexset{Vertex} = new vertexset{Vertex}(0);
+	startTimer();
+        vertices.apply(reset);
+	var start_vertex : int = atoi(argv[2]);
+    	frontier.addVertex(start_vertex);
+    	parent[start_vertex] = start_vertex;
+
+    	#s0# while (frontier.getVertexSetSize() != 0)
+            #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true);
+	    delete frontier;
+	    frontier = output;
+    	end
+        var elapsed_time : float = stopTimer();
+	delete frontier;
+    	print "elapsed time: ";
+    	print elapsed_time;
+    end
+end
+
diff --git a/autotune/gpu_apps/cc.gt b/autotune/gpu_apps/cc.gt
@@ -0,0 +1,55 @@
+element Vertex end
+element Edge end
+
+const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]);
+
+const vertices : vertexset{Vertex} = edges.getVertices();
+const IDs : vector{Vertex}(int) = 1;
+
+const update: vector[1](int);
+
+func updateEdge(src : Vertex, dst : Vertex)
+    var src_id: Vertex = IDs[src];
+    var dst_id: Vertex = IDs[dst];
+
+    IDs[dst_id] min= IDs[src_id];
+    IDs[src_id] min= IDs[dst_id];
+end
+
+func init(v : Vertex)
+     IDs[v] = v;
+end
+
+func pjump(v: Vertex) 
+    var y: Vertex = IDs[v];
+    var x: Vertex = IDs[y];
+    if x != y
+        IDs[v] = x;
+        update[0] = 1;
+    end
+end
+
+func main()
+    var n : int = edges.getVertices();
+    for trail in 0:10
+        var frontier : vertexset{Vertex} = new vertexset{Vertex}(n);
+        startTimer();
+        vertices.apply(init);
+        #s0# while (frontier.getVertexSetSize() != 0)
+            #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs);
+	    delete frontier;
+	    frontier = output;
+            update[0] = 1;
+            while update[0] != 0
+		update[0] = 0;
+		vertices.apply(pjump);
+            end
+        end
+        var elapsed_time : float = stopTimer();
+	delete frontier;
+        print "elapsed time: ";
+        print elapsed_time;
+    end
+end
+
+
diff --git a/autotune/gpu_apps/pagerank.gt b/autotune/gpu_apps/pagerank.gt
@@ -0,0 +1,53 @@
+element Vertex end
+element Edge end
+const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]);
+const vertices : vertexset{Vertex} = edges.getVertices();
+const old_rank : vector{Vertex}(float) = 1.0/vertices.size();
+const new_rank : vector{Vertex}(float) = 0.0;
+const out_degree : vector {Vertex}(int) = edges.getOutDegrees();
+const contrib : vector{Vertex}(float) = 0.0;
+const error : vector{Vertex}(float) = 0.0;
+const damp : float = 0.85;
+const beta_score : float = (1.0 - damp) / vertices.size();
+
+func computeContrib(v : Vertex)
+    contrib[v] = old_rank[v] / out_degree[v];
+end
+
+func updateEdge(src : Vertex, dst : Vertex)
+    new_rank[dst] += contrib[src];
+end
+
+func updateVertex(v : Vertex)
+    var old_score : float = old_rank[v];
+    new_rank[v] = beta_score + damp*(new_rank[v]);
+    error[v] = fabs(new_rank[v] - old_rank[v]);
+    old_rank[v] = new_rank[v];
+    new_rank[v] = 0.0;
+end
+
+func printRank(v : Vertex)
+    print old_rank[v];
+end
+
+func reset(v: Vertex)
+    old_rank[v] = 1.0/vertices.size();
+    new_rank[v] = 0.0;
+end
+
+func main()
+    for trail in 0:10
+    	startTimer();
+        vertices.apply(reset);
+    	#s0# for i in 0:20
+    	    vertices.apply(computeContrib);
+            #s1# edges.apply(updateEdge);
+            vertices.apply(updateVertex);
+    	end
+
+    	var elapsed_time : float = stopTimer();
+    	print "elapsed time: ";
+    	print elapsed_time;
+    end
+end
+
diff --git a/autotune/graphit_gpu_autotuner.py b/autotune/graphit_gpu_autotuner.py
@@ -29,57 +29,117 @@ def manipulator(self):
         Define the search space by creating a                                        
         ConfigurationManipulator                                                     
         """
+        manipulator = ConfigurationManipulator()
+        if self.args.edge_only:
+            #manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT', 'EDGE_ONLY']))
+            manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'EDGE_ONLY']))
+            manipulator.add_parameter(EnumParameter('EB_0', ['ENABLED', 'DISABLED']))
+            manipulator.add_parameter(IntegerParameter('BS_0', 1, 20))
+        else:
+            #manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT']))
+            manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM']))
 
+        manipulator.add_parameter(EnumParameter('direction_0', ['PUSH', 'PULL']))
+        manipulator.add_parameter(EnumParameter('dedup_0', ['ENABLED', 'DISABLED']))
+        manipulator.add_parameter(EnumParameter('frontier_output_0', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP']))
+        manipulator.add_parameter(EnumParameter('pull_rep_0', ['BITMAP', 'BOOLMAP']))
 
+        if self.args.hybrid_schedule:
+            #manipulator.add_parameter(EnumParameter('LB_1', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT']))
+            manipulator.add_parameter(EnumParameter('LB_1', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM']))
+
+            manipulator.add_parameter(EnumParameter('direction_1', ['PUSH', 'PULL']))
+            manipulator.add_parameter(EnumParameter('dedup_1', ['ENABLED', 'DISABLED']))
+            manipulator.add_parameter(EnumParameter('frontier_output_1', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP']))
+            manipulator.add_parameter(EnumParameter('pull_rep_1', ['BITMAP', 'BOOLMAP']))
+
+            # We also choose the hybrid schedule threshold here
+            manipulator.add_parameter(IntegerParameter('threshold', 0, 1000))
+
+
 
-        manipulator = ConfigurationManipulator()
-        manipulator.add_parameter(
-            EnumParameter('LB', 
-                          ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT']))
-
-        #'edge-aware-dynamic-vertex-parallel' not supported with the latest g++ cilk implementation
-        manipulator.add_parameter(EnumParameter('direction', ['PUSH', 'PULL']))
-        manipulator.add_parameter(EnumParameter('dedup', ['ENABLED', 'DISABLED']))
-        manipulator.add_parameter(EnumParameter('frontier_output', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP']))
         # adding new parameters for PriorityGraph (Ordered GraphIt) 
-        manipulator.add_parameter(IntegerParameter('delta', 1, self.args.max_delta))
+        # Currently, delta can be configured only once for the entire program, so we make a single decision even if the schedule is hybrid
+        if self.args.tune_delta:
+            manipulator.add_parameter(IntegerParameter('delta', 1, self.args.max_delta))
+
+
+        if self.args.kernel_fusion:
+            manipulator.add_parameter(EnumParameter('kernel_fusion', ['DISABLED', 'ENABLED']))
 
-        manipulator.add_parameter(EnumParameter('kernel_fusion', ['DISABLED', 'ENABLED']))
-        manipulator.add_parameter(EnumParameter('pull_rep', ['BITMAP', 'BOOLMAP']))
         return manipulator
 
 
     def write_cfg_to_schedule(self, cfg):
         #write into a schedule file the configuration
-        direction = cfg['direction']
-        delta = cfg['delta']
-        dedup = cfg['dedup']
-        frontier_output = cfg['frontier_output']
-        kernel_fusion = cfg['kernel_fusion']
-        pull_rep = cfg['pull_rep']
-        LB = cfg['LB']
+
+        direction_0 = cfg['direction_0']
+        if self.args.tune_delta:
+            delta_0 = cfg['delta']
+        dedup_0 = cfg['dedup_0']
+        frontier_output_0 = cfg['frontier_output_0']
+        pull_rep_0 = cfg['pull_rep_0']
+        LB_0 = cfg['LB_0']
 
         new_schedule = "schedule:\n"
+
         new_schedule += "SimpleGPUSchedule s1;\n";
-        new_schedule += "s1.configLoadBalance(" + LB + ");\n"
-        new_schedule += "s1.configFrontierCreation(" + frontier_output + ");\n"
-        if direction == "PULL":
-            new_schedule += "s1.configDirection(PULL, " + pull_rep + ");\n"
+        if LB_0 == "EDGE_ONLY" and cfg['EB_0'] == "ENABLED":
+            new_schedule += "s1.configLoadBalance(EDGE_ONLY, BLOCKED, " + str(int(int(self.args.num_vertices)/cfg['BS_0'])) + ");\n"
+            direction_0 = "PUSH"
+        else:
+            new_schedule += "s1.configLoadBalance(" + LB_0 + ");\n"
+        new_schedule += "s1.configFrontierCreation(" + frontier_output_0 + ");\n"
+        if direction_0 == "PULL":
+            new_schedule += "s1.configDirection(PULL, " + pull_rep_0 + ");\n"
         else:
             new_schedule += "s1.configDirection(PUSH);\n"
-        new_schedule += "s1.configDelta(" + str(delta) + ");\n"
-        new_schedule += "s1.configDeduplication(" + dedup + ");\n"
-        new_schedule += "program->applyGPUSchedule(\"s0:s1\", s1);\n"
-        new_schedule += "SimpleGPUSchedule s0;\n"
-        new_schedule += "s0.configKernelFusion(" + kernel_fusion + ");\n"
-	# We will currently not apply this. Use this after kernel fusion is fixed
-        #new_schedule += "program->applyGPUSchedule(\"s0\", s0);\n"
+        if self.args.tune_delta:
+            new_schedule += "s1.configDelta(" + str(delta_0) + ");\n"
+        new_schedule += "s1.configDeduplication(" + dedup_0 + ");\n"
+
+        if self.args.hybrid_schedule:
+            direction_1 = cfg['direction_1']
+            if self.args.tune_delta:
+                delta_1 = cfg['delta']
+            dedup_1 = cfg['dedup_1']
+            frontier_output_1 = cfg['frontier_output_1']
+            pull_rep_1 = cfg['pull_rep_1']
+            LB_1 = cfg['LB_1']
+
+            #threshold = self.args.hybrid_threshold
+            threshold = cfg['threshold']
+
+            new_schedule += "SimpleGPUSchedule s2;\n";
+            new_schedule += "s2.configLoadBalance(" + LB_1 + ");\n"
+            new_schedule += "s2.configFrontierCreation(" + frontier_output_1 + ");\n"
+            if direction_1 == "PULL":
+                new_schedule += "s2.configDirection(PULL, " + pull_rep_1 + ");\n"
+            else:
+                new_schedule += "s2.configDirection(PUSH);\n"
+            if self.args.tune_delta:
+                new_schedule += "s2.configDelta(" + str(delta_1) + ");\n"
+            new_schedule += "s2.configDeduplication(" + dedup_1 + ");\n"
+
+            new_schedule += "HybridGPUSchedule h1(INPUT_VERTEXSET_SIZE, " + str(threshold/1000) + ", s1, s2);\n"
+            new_schedule += "program->applyGPUSchedule(\"s0:s1\", h1);\n"
+
+        else:
+            new_schedule += "program->applyGPUSchedule(\"s0:s1\", s1);\n"
+
+
+
+        if self.args.kernel_fusion:
+            kernel_fusion = cfg['kernel_fusion']
+            new_schedule += "SimpleGPUSchedule s0;\n"
+            new_schedule += "s0.configKernelFusion(" + kernel_fusion + ");\n"
+            new_schedule += "program->applyGPUSchedule(\"s0\", s0);\n"
 
         print (cfg)
-        print (new_schedule)
+        #print (new_schedule)
 
         self.new_schedule_file_name = 'schedule_0' 
-        print (self.new_schedule_file_name)
+        #print (self.new_schedule_file_name)
         f1 = open (self.new_schedule_file_name, 'w')
         f1.write(new_schedule)
         f1.close()
@@ -174,7 +234,7 @@ def compile_and_run(self, desired_result, input, limit):
         Compile and run a given configuration then                                   
         return performance                                                           
         """
-        print ("input graph: " + self.args.graph)
+        # print ("input graph: " + self.args.graph)
 
         cfg = desired_result.configuration.data
 
@@ -190,7 +250,7 @@ def compile_and_run(self, desired_result, input, limit):
     def save_final_config(self, configuration):
         """called at the end of tuning"""
         print ('Final Configuration:', configuration.data)
-        self.manipulator().save_to_file(configuration.data,'final_config.json')
+        self.manipulator().save_to_file(configuration.data, self.args.final_config)
 
 
 
@@ -200,11 +260,21 @@ def save_final_config(self, configuration):
     parser.add_argument('--start_vertex', type=str, default="0", help="Start vertex if applicable")
 
     parser.add_argument('--algo_file', type=str, required=True, help='input algorithm file')
+    parser.add_argument('--final_config', type=str, help='Final config file', default="final_config.json")
     parser.add_argument('--default_schedule_file', type=str, required=False, default="", help='default schedule file')
     parser.add_argument('--runtime_limit', type=float, default=300, help='a limit on the running time of each program')
     parser.add_argument('--max_delta', type=int, default=800000, help='maximum delta used for priority coarsening')
     parser.add_argument('--memory_limit', type=int, default=-1,help='set memory limit on unix based systems [does not quite work yet]')    
     parser.add_argument('--killed_process_report_runtime_limit', type=int, default=0, help='reports runtime_limit when a process is killed by the shell. 0 for disable (default), 1 for enable')
+
+    parser.add_argument('--kernel_fusion', type=bool, default=False, help='Choose if you want to also tune kernel fusion')
+    parser.add_argument('--hybrid_schedule', type=bool, default=False, help='Choose if you want to also explore hybrid schedules')
+    parser.add_argument('--edge_only', type=bool, default=False, help='Choose if you want to also enable EDGE_ONLY schedules')
+    parser.add_argument('--num_vertices', type=int, required=True, help='Supply number of vertices in the graph')
+    parser.add_argument('--tune_delta', type=bool, default=False, help='Also tune the delta parameter')
+    parser.add_argument('--hybrid_threshold', type=int, default=1000, help='Threshold value on 1000')
+
+
     args = parser.parse_args()
     # pass the argumetns into the tuner
     GraphItTuner.main(args)

diff --git a/include/graphit/backend/codegen_gpu/assign_function_context.h b/include/graphit/backend/codegen_gpu/assign_function_context.h
@@ -20,6 +20,7 @@ class AssignFunctionContext : mir::MIRVisitor {
 		void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr);
 		void visit(mir::PullEdgeSetApplyExpr::Ptr);
 		void visit(mir::VertexSetApplyExpr::Ptr);
+		void visit(mir::VertexSetWhereExpr::Ptr);
 	private:
 		MIRContext *mir_context_;
 };

diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h
@@ -69,7 +69,7 @@ class CodeGenGPU: public mir::MIRVisitor{
 	void genPropertyArrayAlloca(mir::VarDecl::Ptr);
 
 	void genFusedWhileLoop(mir::WhileStmt::Ptr);
-	void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr);
+	virtual void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr);
 
 	EdgesetApplyFunctionDeclGenerator* edgeset_apply_func_gen_;
 
@@ -142,6 +142,12 @@ class CodeGenGPU: public mir::MIRVisitor{
 
 	virtual void visit(mir::EnqueueVertex::Ptr) override;
 
+        virtual void visit(mir::VertexSetWhereExpr::Ptr) override;
+
+
+	virtual void visit(mir::ListType::Ptr) override;
+	virtual void visit(mir::ListAllocExpr::Ptr) override;
+
 	void genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr); 
 
 };
@@ -179,14 +185,15 @@ class CodeGenGPUFusedKernel: public CodeGenGPU {
 		}
 		current_while_stmt->used_priority_queues.push_back(var);
 	}
-	void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr);
+	virtual void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr) override;
 	virtual void visit(mir::StmtBlock::Ptr) override;
 	virtual void visit(mir::AssignStmt::Ptr) override;
 	virtual void visit(mir::VarDecl::Ptr) override;
 	virtual void visit(mir::VarExpr::Ptr) override;
 	virtual void visit(mir::PrintStmt::Ptr) override;
 	virtual void visit(mir::HybridGPUStmt::Ptr) override;
 	virtual void visit(mir::VertexSetDedupExpr::Ptr) override;
+	virtual void visit(mir::VertexSetApplyExpr::Ptr) override;
 
 	std::string var_name (std::string var) {
 		//return current_kernel_name + "_" + var;