diff --git a/.gitignore b/.gitignore index 467e823f..8c048adf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,7 @@ .settings/ .idea/ build/ +autotune/*.json +*.graphit_bin +*.graphit_sbin cmake-build-debug/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 125d7720..23b87f99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,9 +83,19 @@ add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/graphit.py VERBATIM ) +find_package(CUDA QUIET) + +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/gpu_tests/all_gpu_tests.py + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/gpu_tests + COMMAND sed -e s?\$\{NVCC_COMPILER\}?${CUDA_NVCC_EXECUTABLE}?g -e s?\$\{GRAPHIT_SOURCE_DIRECTORY\}?${CMAKE_SOURCE_DIR}?g -e s?\$\{CXX_COMPILER\}?${CMAKE_CXX_COMPILER}?g -e s?\$\{GRAPHIT_BUILD_DIRECTORY\}?${CMAKE_BINARY_DIR}?g ${CMAKE_SOURCE_DIR}/test/gpu_tests/all_gpu_tests.py > ${CMAKE_BINARY_DIR}/gpu_tests/all_gpu_tests.py + DEPENDS ${CMAKE_SOURCE_DIR}/test/gpu_tests/all_gpu_tests.py + VERBATIM +) + add_custom_target(copy_graphitc_py ALL DEPENDS ${GRAPHITC_PY}) add_custom_target(copy_python_tests ALL DEPENDS ${CMAKE_BINARY_DIR}/python_tests/test_with_schedules.py ${CMAKE_BINARY_DIR}/python_tests/test.py ${CMAKE_BINARY_DIR}/python_tests/pybind_test.py) add_custom_target(copy_graphit_py ALL DEPENDS ${CMAKE_BINARY_DIR}/graphit.py) +add_custom_target(copy_all_gpu_tests_py ALL DEPENDS ${CMAKE_BINARY_DIR}/gpu_tests/all_gpu_tests.py) configure_file(src/main.cpp ${CMAKE_BINARY_DIR}/bin/main.cpp COPYONLY) configure_file(test/library_test_drivers/library_test_driver_cpp.txt ${CMAKE_BINARY_DIR}/bin/library_test_driver.cpp COPYONLY) diff --git a/apps/cc_pjump.gt b/apps/cc_pjump.gt new file mode 100644 index 00000000..152dd81b --- /dev/null +++ b/apps/cc_pjump.gt @@ -0,0 +1,55 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); + +const vertices : vertexset{Vertex} = edges.getVertices(); +const IDs : vector{Vertex}(int) = 1; + +const update: vector[1](int); + +func updateEdge(src : Vertex, dst : Vertex) + var src_id: Vertex = IDs[src]; + var dst_id: Vertex = IDs[dst]; + + IDs[dst_id] min= IDs[src_id]; + IDs[src_id] min= IDs[dst_id]; +end + +func init(v : Vertex) + IDs[v] = v; +end + +func pjump(v: Vertex) + var y: Vertex = IDs[v]; + var x: Vertex = IDs[y]; + if x != y + IDs[v] = x; + update[0] = 1; + end +end + +func main() + var n : int = edges.getVertices(); + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(n); + startTimer(); + vertices.apply(init); + while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs); + delete frontier; + frontier = output; + update[0] = 1; + while update[0] != 0 + update[0] = 0; + vertices.apply(pjump); + end + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + + diff --git a/autotune/cmd.sh b/autotune/cmd.sh new file mode 100644 index 00000000..02c515e2 --- /dev/null +++ b/autotune/cmd.sh @@ -0,0 +1 @@ +python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/sssp_delta_stepping.gt --killed_process_report_runtime_limit 1 --max_delta 100 --runtime_limit 10 --stop-after 900 diff --git a/autotune/compile_gpu.sh b/autotune/compile_gpu.sh new file mode 100644 index 00000000..09475db8 --- /dev/null +++ b/autotune/compile_gpu.sh @@ -0,0 +1,3 @@ +python ../build/bin/graphitc.py -a algotorun.gt -f 
schedule_0 -o test.cu +/usr/local/cuda/bin/nvcc -ccbin /usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=80 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_70,code=sm_70 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=3 +#/usr/local/cuda/bin/nvcc -ccbin /usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=60 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_61,code=sm_61 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=2 diff --git a/autotune/gpu_apps/bfs.gt b/autotune/gpu_apps/bfs.gt new file mode 100644 index 00000000..5025d779 --- /dev/null +++ b/autotune/gpu_apps/bfs.gt @@ -0,0 +1,41 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + diff --git a/autotune/gpu_apps/cc.gt b/autotune/gpu_apps/cc.gt new file mode 100644 index 00000000..05422e0d --- /dev/null +++ b/autotune/gpu_apps/cc.gt @@ -0,0 +1,55 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); + +const vertices : vertexset{Vertex} = edges.getVertices(); +const IDs : vector{Vertex}(int) = 1; + +const update: vector[1](int); + +func updateEdge(src : Vertex, dst : Vertex) + var src_id: Vertex = IDs[src]; + var dst_id: Vertex = IDs[dst]; + + IDs[dst_id] min= IDs[src_id]; + IDs[src_id] min= IDs[dst_id]; +end + +func init(v : Vertex) + IDs[v] = v; +end + +func pjump(v: Vertex) + var y: Vertex = IDs[v]; + var x: Vertex = IDs[y]; + if x != y + IDs[v] = x; + update[0] = 1; + end +end + +func main() + var n : int = edges.getVertices(); + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(n); + startTimer(); + vertices.apply(init); + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs); + delete frontier; + frontier = output; + update[0] = 1; + while update[0] != 0 + update[0] = 0; + vertices.apply(pjump); + end + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + + diff --git a/autotune/gpu_apps/pagerank.gt b/autotune/gpu_apps/pagerank.gt new file mode 100644 index 00000000..c171e078 --- /dev/null +++ b/autotune/gpu_apps/pagerank.gt @@ -0,0 +1,53 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const old_rank : vector{Vertex}(float) = 1.0/vertices.size(); +const new_rank : 
vector{Vertex}(float) = 0.0; +const out_degree : vector {Vertex}(int) = edges.getOutDegrees(); +const contrib : vector{Vertex}(float) = 0.0; +const error : vector{Vertex}(float) = 0.0; +const damp : float = 0.85; +const beta_score : float = (1.0 - damp) / vertices.size(); + +func computeContrib(v : Vertex) + contrib[v] = old_rank[v] / out_degree[v]; +end + +func updateEdge(src : Vertex, dst : Vertex) + new_rank[dst] += contrib[src]; +end + +func updateVertex(v : Vertex) + var old_score : float = old_rank[v]; + new_rank[v] = beta_score + damp*(new_rank[v]); + error[v] = fabs(new_rank[v] - old_rank[v]); + old_rank[v] = new_rank[v]; + new_rank[v] = 0.0; +end + +func printRank(v : Vertex) + print old_rank[v]; +end + +func reset(v: Vertex) + old_rank[v] = 1.0/vertices.size(); + new_rank[v] = 0.0; +end + +func main() + for trail in 0:10 + startTimer(); + vertices.apply(reset); + #s0# for i in 0:20 + vertices.apply(computeContrib); + #s1# edges.apply(updateEdge); + vertices.apply(updateVertex); + end + + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + end +end + diff --git a/autotune/gpu_apps/sssp_delta_stepping.gt b/autotune/gpu_apps/sssp_delta_stepping.gt new file mode 100644 index 00000000..0cb31c7f --- /dev/null +++ b/autotune/gpu_apps/sssp_delta_stepping.gt @@ -0,0 +1,38 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + delete pq; + end +end diff --git a/autotune/graphit_gpu_autotuner.py b/autotune/graphit_gpu_autotuner.py new file mode 100644 index 00000000..ccaa070b --- /dev/null +++ b/autotune/graphit_gpu_autotuner.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python +# +# Autotune schedules for DeltaStepping in the GraphIt language +# + +# import adddeps # fix sys.path +import opentuner +from opentuner import ConfigurationManipulator +from opentuner import EnumParameter +from opentuner import IntegerParameter +from opentuner import MeasurementInterface +from opentuner import Result +from sys import exit +import argparse + +py_graphitc_file = "../build/bin/graphitc.py" +serial_compiler = "g++" + +#if using icpc for par_compiler, the compilation flags for CILK and OpenMP needs to be changed +par_compiler = "g++" + +class GraphItTuner(MeasurementInterface): + new_schedule_file_name = '' + # a flag for testing if NUMA-aware schedule is specified + + + def manipulator(self): + """ + Define the search space by creating a + ConfigurationManipulator + """ + manipulator = ConfigurationManipulator() + if self.args.edge_only: + 
#manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT', 'EDGE_ONLY'])) + manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'EDGE_ONLY'])) + manipulator.add_parameter(EnumParameter('EB_0', ['ENABLED', 'DISABLED'])) + manipulator.add_parameter(IntegerParameter('BS_0', 1, 20)) + else: + #manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT'])) + manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM'])) + + manipulator.add_parameter(EnumParameter('direction_0', ['PUSH', 'PULL'])) + manipulator.add_parameter(EnumParameter('dedup_0', ['ENABLED', 'DISABLED'])) + manipulator.add_parameter(EnumParameter('frontier_output_0', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP'])) + manipulator.add_parameter(EnumParameter('pull_rep_0', ['BITMAP', 'BOOLMAP'])) + + if self.args.hybrid_schedule: + #manipulator.add_parameter(EnumParameter('LB_1', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT'])) + manipulator.add_parameter(EnumParameter('LB_1', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM'])) + + manipulator.add_parameter(EnumParameter('direction_1', ['PUSH', 'PULL'])) + manipulator.add_parameter(EnumParameter('dedup_1', ['ENABLED', 'DISABLED'])) + manipulator.add_parameter(EnumParameter('frontier_output_1', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP'])) + manipulator.add_parameter(EnumParameter('pull_rep_1', ['BITMAP', 'BOOLMAP'])) + + # We also choose the hybrid schedule threshold here + manipulator.add_parameter(IntegerParameter('threshold', 0, 1000)) + + + + # adding new parameters for PriorityGraph (Ordered GraphIt) + # Currently since delta is allowed to be configured only once for the entire program, we will make a single decision even if the schedule is hybrid + if self.args.tune_delta: + manipulator.add_parameter(IntegerParameter('delta', 1, self.args.max_delta)) + + + if self.args.kernel_fusion: + manipulator.add_parameter(EnumParameter('kernel_fusion', ['DISABLED', 'ENABLED'])) + + return manipulator + + + def write_cfg_to_schedule(self, cfg): + #write into a schedule file the configuration + + direction_0 = cfg['direction_0'] + if self.args.tune_delta: + delta_0 = cfg['delta'] + dedup_0 = cfg['dedup_0'] + frontier_output_0 = cfg['frontier_output_0'] + pull_rep_0 = cfg['pull_rep_0'] + LB_0 = cfg['LB_0'] + + new_schedule = "schedule:\n" + + new_schedule += "SimpleGPUSchedule s1;\n"; + if LB_0 == "EDGE_ONLY" and cfg['EB_0'] == "ENABLED": + new_schedule += "s1.configLoadBalance(EDGE_ONLY, BLOCKED, " + str(int(int(self.args.num_vertices)/cfg['BS_0'])) + ");\n" + direction_0 = "PUSH" + else: + new_schedule += "s1.configLoadBalance(" + LB_0 + ");\n" + new_schedule += "s1.configFrontierCreation(" + frontier_output_0 + ");\n" + if direction_0 == "PULL": + new_schedule += "s1.configDirection(PULL, " + pull_rep_0 + ");\n" + else: + new_schedule += "s1.configDirection(PUSH);\n" + if self.args.tune_delta: + new_schedule += "s1.configDelta(" + str(delta_0) + ");\n" + new_schedule += "s1.configDeduplication(" + dedup_0 + ");\n" + + if self.args.hybrid_schedule: + direction_1 = cfg['direction_1'] + if self.args.tune_delta: + delta_1 = cfg['delta'] + dedup_1 = cfg['dedup_1'] + frontier_output_1 = cfg['frontier_output_1'] + pull_rep_1 = cfg['pull_rep_1'] + LB_1 = cfg['LB_1'] + + #threshold = self.args.hybrid_threshold + threshold = cfg['threshold'] + + new_schedule += "SimpleGPUSchedule s2;\n"; + new_schedule += "s2.configLoadBalance(" + LB_1 + 
");\n" + new_schedule += "s2.configFrontierCreation(" + frontier_output_1 + ");\n" + if direction_1 == "PULL": + new_schedule += "s2.configDirection(PULL, " + pull_rep_1 + ");\n" + else: + new_schedule += "s2.configDirection(PUSH);\n" + if self.args.tune_delta: + new_schedule += "s2.configDelta(" + str(delta_1) + ");\n" + new_schedule += "s2.configDeduplication(" + dedup_1 + ");\n" + + new_schedule += "HybridGPUSchedule h1(INPUT_VERTEXSET_SIZE, " + str(threshold/1000) + ", s1, s2);\n" + new_schedule += "program->applyGPUSchedule(\"s0:s1\", h1);\n" + + else: + new_schedule += "program->applyGPUSchedule(\"s0:s1\", s1);\n" + + + + if self.args.kernel_fusion: + kernel_fusion = cfg['kernel_fusion'] + new_schedule += "SimpleGPUSchedule s0;\n" + new_schedule += "s0.configKernelFusion(" + kernel_fusion + ");\n" + new_schedule += "program->applyGPUSchedule(\"s0\", s0);\n" + + print (cfg) + #print (new_schedule) + + self.new_schedule_file_name = 'schedule_0' + #print (self.new_schedule_file_name) + f1 = open (self.new_schedule_file_name, 'w') + f1.write(new_schedule) + f1.close() + + def compile(self, cfg, id): + """ + Compile a given configuration in parallel + """ + try: + self.call_program("cp " + self.args.algo_file + " algotorun.gt") + return self.call_program("bash compile_gpu.sh") + except: + print ("fail to compiler .gt file") + self.call_program("false") + + + def parse_running_time(self, log_file_name='test.out'): + """Returns the elapsed time only, from the HPL output file""" + + min_time = 10000 + + with open(log_file_name) as f: + content = f.readlines() + content = [x.strip() for x in content] + i = 0; + for line in content: + if line.find("elapsed time") != -1: + next_line = content[i+1] + time_str = next_line.strip() + time = float(time_str) + if time < min_time: + min_time = time + i = i+1; + + return min_time + + def run_precompiled(self, desired_result, input, limit, compile_result, id): + """ + Run a compile_result from compile() sequentially and return performance + """ + + cfg = desired_result.configuration.data + + if compile_result['returncode'] != 0: + print (str(compile_result)) + + assert compile_result['returncode'] == 0 + try: + run_cmd = "./test " + self.args.graph + " " + self.args.start_vertex + " > test.out" + print ("run_cmd: " + run_cmd) + + # default value -1 for memory_limit translates into None (no memory upper limit) + # setting memory limit does not quite work yet + process_memory_limit = None + if self.args.memory_limit != -1: + process_memory_limit = self.args.memory_limit + # print ("memory limit: " + str(process_memory_limit)) + run_result = self.call_program(run_cmd, limit=self.args.runtime_limit, memory_limit=process_memory_limit) + finally: + pass + + #self.call_program('rm test') + #self.call_program('rm test.cpp') + + if run_result['timeout'] == True: + val = self.args.runtime_limit + else: + val = self.parse_running_time(); + + self.call_program('rm test.out') + print ("run result: " + str(run_result)) + print ("running time: " + str(val)) + + if run_result['timeout'] == True: + print ("Timed out after " + str(self.args.runtime_limit) + " seconds") + return opentuner.resultsdb.models.Result(time=val) + elif run_result['returncode'] != 0: + if self.args.killed_process_report_runtime_limit == 1 and run_result['stderr'] == 'Killed\n' or True: + print ("process killed " + str(run_result)) + return opentuner.resultsdb.models.Result(time=self.args.runtime_limit) + else: + print (str(run_result)) + exit() + else: + return 
opentuner.resultsdb.models.Result(time=val) + + + + + def compile_and_run(self, desired_result, input, limit): + """ + Compile and run a given configuration then + return performance + """ + # print ("input graph: " + self.args.graph) + + cfg = desired_result.configuration.data + + + self.write_cfg_to_schedule(cfg) + + # this pases in the id 0 for the configuration + compile_result = self.compile(cfg, 0) + # print "compile_result: " + str(compile_result) + return self.run_precompiled(desired_result, input, limit, compile_result, 0) + + + def save_final_config(self, configuration): + """called at the end of tuning""" + print ('Final Configuration:', configuration.data) + self.manipulator().save_to_file(configuration.data, self.args.final_config) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(parents=opentuner.argparsers()) + parser.add_argument('--graph', type=str, default="", help='the graph to tune on') + parser.add_argument('--start_vertex', type=str, default="0", help="Start vertex if applicable") + + parser.add_argument('--algo_file', type=str, required=True, help='input algorithm file') + parser.add_argument('--final_config', type=str, help='Final config file', default="final_config.json") + parser.add_argument('--default_schedule_file', type=str, required=False, default="", help='default schedule file') + parser.add_argument('--runtime_limit', type=float, default=300, help='a limit on the running time of each program') + parser.add_argument('--max_delta', type=int, default=800000, help='maximum delta used for priority coarsening') + parser.add_argument('--memory_limit', type=int, default=-1,help='set memory limit on unix based systems [does not quite work yet]') + parser.add_argument('--killed_process_report_runtime_limit', type=int, default=0, help='reports runtime_limit when a process is killed by the shell. 
0 for disable (default), 1 for enable') + + parser.add_argument('--kernel_fusion', type=bool, default=False, help='Choose if you want to also tune kernel fusion') + parser.add_argument('--hybrid_schedule', type=bool, default=False, help='Choose if you want to also explore hybrid schedules') + parser.add_argument('--edge_only', type=bool, default=False, help='Choose if you want to also enable EDGE_ONLY schedules') + parser.add_argument('--num_vertices', type=int, required=True, help='Supply number of vertices in the graph') + parser.add_argument('--tune_delta', type=bool, default=False, help='Also tune the delta parameter') + parser.add_argument('--hybrid_threshold', type=int, default=1000, help='Threshold value on 1000') + + + args = parser.parse_args() + # pass the argumetns into the tuner + GraphItTuner.main(args) + diff --git a/autotune/run.sh b/autotune/run.sh new file mode 100644 index 00000000..f78fb321 --- /dev/null +++ b/autotune/run.sh @@ -0,0 +1,28 @@ + +export CUDA_VISIBLE_DEVICES=6 + + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/sssp_delta_stepping.gt --killed_process_report_runtime_limit 1 --max_delta 100 --runtime_limit 20 --stop-after 600 --final_config=final_config_ds_livejournal.json --kernel_fusion=True --num_vertices=0 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-twitter-2010.mtx --algo_file gpu_apps/sssp_delta_stepping.gt --killed_process_report_runtime_limit 1 --max_delta 100 --runtime_limit 20 --stop-after 600 --final_config=final_config_ds_twitter.json --kernel_fusion=True --num_vertices=0 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/road_usa.weighted.mtx --algo_file gpu_apps/sssp_delta_stepping.gt --killed_process_report_runtime_limit 1 --max_delta 100000 --runtime_limit 20 --stop-after 1500 --final_config=final_config_ds_road_usa.json --kernel_fusion=True --num_vertices=0 + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/cc.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 10 --stop-after 600 --final_config=final_config_cc_livejournal.json --num_vertices=0 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-twitter-2010.mtx --algo_file gpu_apps/cc.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 30 --stop-after 600 --final_config=final_config_cc_twitter.json --num_vertices=0 + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/road_usa.weighted.mtx --algo_file gpu_apps/cc.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_cc_road_usa.json --num_vertices=0 + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/pagerank.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_pr_livejournal.json --kernel_fusion=True --edge_only=True --num_vertices=4847571 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-twitter-2010.mtx --algo_file gpu_apps/pagerank.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 30 --stop-after 600 --final_config=final_config_pr_twitter.json --kernel_fusion=True --edge_only=True --num_vertices=21297772 +#python3 
graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/road_usa.weighted.mtx --algo_file gpu_apps/pagerank.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_pr_road_usa.json --kernel_fusion=True --edge_only=True --num_vertices=23947347 + + + +python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-orkut.mtx --algo_file gpu_apps/bfs.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after=36000 --final_config=final_config_bfs_orkut.json --kernel_fusion=True --num_vertices=0 --hybrid_schedule=True + +python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/bfs.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after=3600 --final_config=final_config_bfs_livejournal.json --kernel_fusion=True --num_vertices=0 --hybrid_schedule=True --hybrid_threshold=8 + + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-twitter-2010.mtx --algo_file gpu_apps/bfs.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_bfs_twitter.json --kernel_fusion=True --num_vertices=0 --hybrid_schedule=True + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/road_usa.weighted.mtx --algo_file gpu_apps/bfs.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_bfs_road_usa.json --kernel_fusion=True --num_vertices=0 --hybrid_schedule=1 + diff --git a/graphit_eval/eval/gpu_perf/.gitignore b/graphit_eval/eval/gpu_perf/.gitignore new file mode 100644 index 00000000..e3eb845b --- /dev/null +++ b/graphit_eval/eval/gpu_perf/.gitignore @@ -0,0 +1,2 @@ +output/* +scratch/* diff --git a/graphit_eval/eval/gpu_perf/inputs/bc_power.gt b/graphit_eval/eval/gpu_perf/inputs/bc_power.gt new file mode 100644 index 00000000..0209d8cb --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/bc_power.gt @@ -0,0 +1,126 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); + +const num_paths : vector{Vertex}(int) = 0; +const dependences : vector{Vertex}(float) = 0; +const visited : vector{Vertex}(bool) = false; + +func forward_update(src : Vertex, dst : Vertex) + num_paths[dst] += num_paths[src]; +end + +func visited_vertex_filter(v : Vertex) -> output : bool + output = (visited[v] == false); +end + +func mark_visited(v : Vertex) + visited[v] = true; +end + +func mark_unvisited(v : Vertex) + visited[v] = false; +end + +func backward_vertex_f(v : Vertex) + visited[v] = true; + dependences[v] += 1.0 / num_paths[v]; +end + +func backward_update(src : Vertex, dst : Vertex) + dependences[dst] += dependences[src]; +end + +func final_vertex_f(v : Vertex) + if num_paths[v] != 0 + dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v]; + else + dependences[v] = 0; + end +end + +func reset(v : Vertex) + dependences[v] = 0; + num_paths[v] = 0; +end + + + + +func main() + + % transposing the edges + var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose(); + for trail in 0:1 + startTimer(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + + frontier.addVertex(start_vertex); + num_paths[start_vertex] = 1; + 
visited[start_vertex] = true; + var round : int = 0; + var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}(); + + frontier_list.insert(frontier); + + % foward pass to propagate num_paths + while (frontier.getVertexSetSize() != 0) + round = round + 1; + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths); + delete frontier; + output.apply(mark_visited); + frontier_list.insert(output); + frontier = output; + end + + + % resetting the visited information for the backward pass + vertices.apply(mark_unvisited); + + % pop off the empty frontier + frontier_list.retrieve(frontier); + + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + + % backward pass to accumulate the dependencies + while (round > 0) + #s2# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update); + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + end + delete frontier; + + vertices.apply(final_vertex_f); + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + vertices.apply(reset); + end + + + +end + + +schedule: + + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + + SimpleGPUSchedule s2; + s2.configLoadBalance(TWCE); + s2.configDirection(PULL, BITMAP); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + + program->applyGPUSchedule("s1", h1); + program->applyGPUSchedule("s2", h1); + diff --git a/graphit_eval/eval/gpu_perf/inputs/bc_road.gt b/graphit_eval/eval/gpu_perf/inputs/bc_road.gt new file mode 100644 index 00000000..b591e9d4 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/bc_road.gt @@ -0,0 +1,126 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); + +const num_paths : vector{Vertex}(int) = 0; +const dependences : vector{Vertex}(float) = 0; +const visited : vector{Vertex}(bool) = false; + +func forward_update(src : Vertex, dst : Vertex) + num_paths[dst] += num_paths[src]; +end + +func visited_vertex_filter(v : Vertex) -> output : bool + output = (visited[v] == false); +end + +func mark_visited(v : Vertex) + visited[v] = true; +end + +func mark_unvisited(v : Vertex) + visited[v] = false; +end + +func backward_vertex_f(v : Vertex) + visited[v] = true; + dependences[v] += 1.0 / num_paths[v]; +end + +func backward_update(src : Vertex, dst : Vertex) + dependences[dst] += dependences[src]; +end + +func final_vertex_f(v : Vertex) + if num_paths[v] != 0 + dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v]; + else + dependences[v] = 0; + end +end + +func reset(v : Vertex) + dependences[v] = 0; + num_paths[v] = 0; +end + + + + +func main() + + % transposing the edges + var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose(); + for trail in 0:1 + startTimer(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + + frontier.addVertex(start_vertex); + num_paths[start_vertex] = 1; + visited[start_vertex] = true; + var round : int = 0; + var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}(); + + frontier_list.insert(frontier); + + % foward pass to propagate num_paths + #s0# while (frontier.getVertexSetSize() != 0) + round = round + 1; + #s1# var output : 
vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths); + delete frontier; + output.apply(mark_visited); + frontier_list.insert(output); + frontier = output; + end + + + % resetting the visited information for the backward pass + vertices.apply(mark_unvisited); + + % pop off the empty frontier + frontier_list.retrieve(frontier); + + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + + % backward pass to accumulate the dependencies + #s2# while (round > 0) + #s3# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update); + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + end + delete frontier; + + vertices.apply(final_vertex_f); + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + vertices.apply(reset); + end + + + +end + + +schedule: + + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + s1.configDeduplication(ENABLED, FUSED); + + + program->applyGPUSchedule("s0:s1", s1); + program->applyGPUSchedule("s2:s3", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + + program->applyGPUSchedule("s0", s0); + program->applyGPUSchedule("s2", s0); diff --git a/graphit_eval/eval/gpu_perf/inputs/bfs_power.gt b/graphit_eval/eval/gpu_perf/inputs/bfs_power.gt new file mode 100644 index 00000000..ee63e3a1 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/bfs_power.gt @@ -0,0 +1,63 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + + s1.configDeduplication(DISABLED); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + //s1.configDeduplication(ENABLED); + //s1.configFrontierCreation(UNFUSED_BITMAP); + + + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + s2.configDeduplication(DISABLED); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + program->applyGPUSchedule("s0:s1", h1); + + diff --git a/graphit_eval/eval/gpu_perf/inputs/bfs_road.gt b/graphit_eval/eval/gpu_perf/inputs/bfs_road.gt new file mode 100644 index 00000000..b44597e2 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/bfs_road.gt @@ -0,0 +1,56 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); 
+const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configDeduplication(DISABLED); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + + + program->applyGPUSchedule("s0:s1", s1); + + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/graphit_eval/eval/gpu_perf/inputs/cc_power.gt b/graphit_eval/eval/gpu_perf/inputs/cc_power.gt new file mode 100644 index 00000000..3c7cf885 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/cc_power.gt @@ -0,0 +1,64 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); + +const vertices : vertexset{Vertex} = edges.getVertices(); +const IDs : vector{Vertex}(int) = 1; + +const update: vector[1](int); + +func updateEdge(src : Vertex, dst : Vertex) + var src_id: Vertex = IDs[src]; + var dst_id: Vertex = IDs[dst]; + + IDs[dst_id] min= IDs[src_id]; + IDs[src_id] min= IDs[dst_id]; +end + +func init(v : Vertex) + IDs[v] = v; +end + +func pjump(v: Vertex) + var y: Vertex = IDs[v]; + var x: Vertex = IDs[y]; + if x != y + IDs[v] = x; + update[0] = 1; + end +end + +func main() + var n : int = edges.getVertices(); + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(n); + startTimer(); + vertices.apply(init); + while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs); + delete frontier; + frontier = output; + update[0] = 1; + #s0# while update[0] != 0 + update[0] = 0; + vertices.apply(pjump); + end + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWC); + s1.configDeduplication(ENABLED); + s1.configFrontierCreation(UNFUSED_BITMAP); + program->applyGPUSchedule("s1", s1); + + diff --git a/graphit_eval/eval/gpu_perf/inputs/pr_social.gt b/graphit_eval/eval/gpu_perf/inputs/pr_social.gt new file mode 100644 index 00000000..18aea9cc --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/pr_social.gt @@ -0,0 +1,57 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const old_rank : vector{Vertex}(double) = 1.0/vertices.size(); +const new_rank : vector{Vertex}(double) = 0.0; +const out_degree : vector {Vertex}(int) = edges.getOutDegrees(); +const contrib : vector{Vertex}(double) = 0.0; +const error : vector{Vertex}(double) = 0.0; +const damp : double = 0.85; +const 
beta_score : double = (1.0 - damp) / vertices.size(); + +func computeContrib(v : Vertex) + contrib[v] = old_rank[v] / out_degree[v]; +end + +func updateEdge(src : Vertex, dst : Vertex) + new_rank[dst] += contrib[src]; +end + +func updateVertex(v : Vertex) + var old_score : double = old_rank[v]; + new_rank[v] = beta_score + damp*(new_rank[v]); + error[v] = fabs(new_rank[v] - old_rank[v]); + old_rank[v] = new_rank[v]; + new_rank[v] = 0.0; +end + +func printRank(v : Vertex) + print old_rank[v]; +end + +func reset(v: Vertex) + old_rank[v] = 1.0/vertices.size(); + new_rank[v] = 0.0; +end + +func main() + for trail in 0:10 + startTimer(); + vertices.apply(reset); + for i in 0:20 + vertices.apply(computeContrib); + #s1# edges.apply(updateEdge); + vertices.apply(updateVertex); + end + + var elapsed_time : double = stopTimer(); + print "elapsed time: "; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); diff --git a/graphit_eval/eval/gpu_perf/inputs/sssp_power.gt b/graphit_eval/eval/gpu_perf/inputs/sssp_power.gt new file mode 100644 index 00000000..de01ddca --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/sssp_power.gt @@ -0,0 +1,46 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + delete pq; + end +end + + +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(UNFUSED_BOOLMAP); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); diff --git a/graphit_eval/eval/gpu_perf/inputs/sssp_road.gt b/graphit_eval/eval/gpu_perf/inputs/sssp_road.gt new file mode 100644 index 00000000..ce00052d --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/sssp_road.gt @@ -0,0 +1,50 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 
0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + delete pq; + end +end + + +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/graphit_eval/eval/gpu_perf/run_tests.py b/graphit_eval/eval/gpu_perf/run_tests.py new file mode 100644 index 00000000..42f644e8 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/run_tests.py @@ -0,0 +1,273 @@ +import os +import sys +import subprocess +import shutil + +NVCC_PATH="/usr/local/cuda/bin/nvcc" + +GRAPHIT_SRC_DIR="" +GRAPHIT_BUILD_DIR="" +GRAPH_DIR="" + +WORKING_DIR=os.path.abspath("./scratch").rstrip("/") + +OUTPUT_DIR=os.path.abspath("./output").rstrip("/") +INPUTS_DIR=os.path.abspath("./inputs").rstrip("/") + + +GPU_CC="" +NUM_SM="" + +def get_command_output_class(command): + output = "" + if isinstance(command, list): + proc = subprocess.Popen(command, stdout=subprocess.PIPE) + else: + print(command) + proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + exitcode = proc.wait() + for line in proc.stdout.readlines(): + if isinstance(line, bytes): + line = line.decode() + output += line.rstrip() + "\n" + + proc.stdout.close() + return exitcode, output + +def get_command_output(command): + (exitcode, output) = get_command_output_class(command) + if exitcode != 0: + print("Error executing command:", command) + exit(1) + return output + +def get_gpu_prop(): + global GPU_CC + global NUM_SM + global NVCC_PATH + global GRAPHIT_SRC_DIR + global WORKING_DIR + + get_command_output(NVCC_PATH + " " + GRAPHIT_SRC_DIR + "/test/gpu_tests/test_input/obtain_gpu_cc.cu -o " + WORKING_DIR + "/obtain_gpu_cc") + output = get_command_output(WORKING_DIR+"/obtain_gpu_cc").strip().split("\n") + if len(output) != 2: + print("Cannot obtain GPU information") + exit(1) + GPU_CC=output[0] + NUM_SM=output[1] + + +def compile_and_execute(input_file, graph_name, args, output_name): + global GRAPHIT_SRC_DIR + global GRAPHIT_BUILD_DIR + global GRAPH_DIR + global WORKING_DIR + global OUTPUT_DIR + global GPU_CC + global NUM_SM + global NVCC_PATH + + nvcc_command = NVCC_PATH + " -rdc=true --use_fast_math -Xptxas \"-dlcm=ca --maxrregcount=64\" -std=c++11 -DNUM_CTA=" + str(int(NUM_SM)*2)+ " -DCTA_SIZE=512 -gencode arch=compute_" + GPU_CC + ",code=sm_"+GPU_CC + + graphit_compiler_command = "python " + GRAPHIT_BUILD_DIR + "/bin/graphitc.py -o " + WORKING_DIR+"/test_cpp.cu -f" + + + cwd = os.getcwd() + os.chdir(WORKING_DIR) + get_command_output(graphit_compiler_command + " " + input_file) + get_command_output(nvcc_command + " " + WORKING_DIR+"/test_cpp.cu -o " + WORKING_DIR+"/test_executable -I " + GRAPHIT_SRC_DIR+"/src/runtime_lib") + output = get_command_output(WORKING_DIR+"/test_executable " + graph_name + " " + args) + os.chdir(cwd) + + f = open(OUTPUT_DIR+"/"+output_name, "w") + f.write(output) + f.close() + + +def parse_output_file(output_name): + global OUTPUT_DIR + f = open(OUTPUT_DIR+"/"+output_name) + content = f.read().strip().split("\n") + f.close() + min_time = 1000000 + for line in content: + try: + time = float(line) + except ValueError as verr: + time = -1 + if time == -1: + 
continue + if time < min_time: + min_time = time + return time + +def create_csv(time_values, output_name): + global OUTPUT_DIR + f = open(OUTPUT_DIR+"/"+output_name, "w") + + for graph in time_values.keys(): + f.write (graph+", " + str(time_values[graph]) + "\n") + + f.close() + +def test_pr(): + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/soc-orkut.mtx", "", "pr_OK") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "", "pr_TW") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "", "pr_LJ") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "", "pr_SW") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "", "pr_IC") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "", "pr_HW") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/road_central.weighted.mtx", "", "pr_RC") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "", "pr_RU") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "", "pr_RN") + + time_values={} + time_values['OK'] = parse_output_file("pr_OK") + time_values['TW'] = parse_output_file("pr_TW") + time_values['LJ'] = parse_output_file("pr_LJ") + time_values['SW'] = parse_output_file("pr_SW") + time_values['IC'] = parse_output_file("pr_IC") + time_values['HW'] = parse_output_file("pr_HW") + time_values['RC'] = parse_output_file("pr_RC") + time_values['RU'] = parse_output_file("pr_RU") + time_values['RN'] = parse_output_file("pr_RN") + + create_csv(time_values, "pr.csv") + +def test_sssp(): + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/soc-orkut.mtx", "0 22", "sssp_OK") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "0 15", "sssp_TW") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "0 120", "sssp_LJ") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "0 15", "sssp_SW") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "0 15", "sssp_IC") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "0 15", "sssp_HW") + compile_and_execute(INPUTS_DIR+"/sssp_road.gt", GRAPH_DIR+"/road_central.weighted.mtx", "0 80000", "sssp_RC") + compile_and_execute(INPUTS_DIR+"/sssp_road.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "0 80000", "sssp_RU") + compile_and_execute(INPUTS_DIR+"/sssp_road.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "0 80000", "sssp_RN") + + time_values={} + time_values['OK'] = parse_output_file("sssp_OK") + time_values['TW'] = parse_output_file("sssp_TW") + time_values['LJ'] = parse_output_file("sssp_LJ") + time_values['SW'] = parse_output_file("sssp_SW") + time_values['IC'] = parse_output_file("sssp_IC") + time_values['HW'] = parse_output_file("sssp_HW") + time_values['RC'] = parse_output_file("sssp_RC") + time_values['RU'] = parse_output_file("sssp_RU") + time_values['RN'] = parse_output_file("sssp_RN") + + create_csv(time_values, "sssp.csv") + +def test_cc(): + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/soc-orkut.mtx", "", "cc_OK") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "", "cc_TW") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "", "cc_LJ") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", 
GRAPH_DIR+"/soc-sinaweibo.mtx", "", "cc_SW") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "", "cc_IC") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "", "cc_HW") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/road_central.weighted.mtx", "", "cc_RC") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "", "cc_RU") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "", "cc_RN") + + time_values={} + time_values['OK'] = parse_output_file("cc_OK") + time_values['TW'] = parse_output_file("cc_TW") + time_values['LJ'] = parse_output_file("cc_LJ") + time_values['SW'] = parse_output_file("cc_SW") + time_values['IC'] = parse_output_file("cc_IC") + time_values['HW'] = parse_output_file("cc_HW") + time_values['RC'] = parse_output_file("cc_RC") + time_values['RU'] = parse_output_file("cc_RU") + time_values['RN'] = parse_output_file("cc_RN") + + create_csv(time_values, "cc.csv") + +def test_bfs(): + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/soc-orkut.mtx", "0 0.12", "bfs_OK") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "0 0.03", "bfs_TW") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "0 0.015", "bfs_LJ") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "0 0.012", "bfs_SW") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "0 0.03", "bfs_IC") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "0 0.03", "bfs_HW") + compile_and_execute(INPUTS_DIR+"/bfs_road.gt", GRAPH_DIR+"/road_central.weighted.mtx", "0", "bfs_RC") + compile_and_execute(INPUTS_DIR+"/bfs_road.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "0", "bfs_RU") + compile_and_execute(INPUTS_DIR+"/bfs_road.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "0", "bfs_RN") + + time_values={} + time_values['OK'] = parse_output_file("bfs_OK") + time_values['TW'] = parse_output_file("bfs_TW") + time_values['LJ'] = parse_output_file("bfs_LJ") + time_values['SW'] = parse_output_file("bfs_SW") + time_values['IC'] = parse_output_file("bfs_IC") + time_values['HW'] = parse_output_file("bfs_HW") + time_values['RC'] = parse_output_file("bfs_RC") + time_values['RU'] = parse_output_file("bfs_RU") + time_values['RN'] = parse_output_file("bfs_RN") + + create_csv(time_values, "bfs.csv") + +def test_bc(): + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/soc-orkut.mtx", "0 0.12", "bc_OK") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "0 0.03", "bc_TW") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "0 0.015", "bc_LJ") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "0 0.012", "bc_SW") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "0 0.03", "bc_IC") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "0 0.03", "bc_HW") + compile_and_execute(INPUTS_DIR+"/bc_road.gt", GRAPH_DIR+"/road_central.weighted.mtx", "0", "bc_RC") + compile_and_execute(INPUTS_DIR+"/bc_road.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "0", "bc_RU") + compile_and_execute(INPUTS_DIR+"/bc_road.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "0", "bc_RN") + + time_values={} + time_values['OK'] = parse_output_file("bc_OK") + time_values['TW'] = 
parse_output_file("bc_TW") + time_values['LJ'] = parse_output_file("bc_LJ") + time_values['SW'] = parse_output_file("bc_SW") + time_values['IC'] = parse_output_file("bc_IC") + time_values['HW'] = parse_output_file("bc_HW") + time_values['RC'] = parse_output_file("bc_RC") + time_values['RU'] = parse_output_file("bc_RU") + time_values['RN'] = parse_output_file("bc_RN") + + create_csv(time_values, "bc.csv") + +def run_all_tests(): + test_pr() + test_sssp() + test_cc() + test_bfs() + test_bc() + +def usage(pname): + print("Usage:") + print(pname + " ") + +def main(): + global GRAPHIT_SRC_DIR + global GRAPHIT_BUILD_DIR + global GRAPH_DIR + global WORKING_DIR + global OUTPUT_DIR + + if len(sys.argv) < 4: + usage(sys.argv[0]) + exit(1) + GRAPHIT_SRC_DIR = os.path.abspath(sys.argv[1].strip()).rstrip("/") + GRAPHIT_BUILD_DIR = os.path.abspath(sys.argv[2].strip()).rstrip("/") + GRAPH_DIR = os.path.abspath(sys.argv[3].strip()).rstrip("/") + + + if os.path.isdir(WORKING_DIR): + shutil.rmtree(WORKING_DIR) + os.mkdir(WORKING_DIR) + + + if not os.path.isdir(OUTPUT_DIR): + os.mkdir(OUTPUT_DIR) + + get_gpu_prop() + + run_all_tests() + +if __name__ == '__main__': + main() + diff --git a/graphit_eval/g2_cgo2021_eval/.gitignore b/graphit_eval/g2_cgo2021_eval/.gitignore new file mode 100644 index 00000000..1f593ec1 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/.gitignore @@ -0,0 +1,5 @@ +fig3_outputs/* +table7_outputs/* +dataset/* +!dataset/Makefile +!dataset/local.sh diff --git a/graphit_eval/g2_cgo2021_eval/README.md b/graphit_eval/g2_cgo2021_eval/README.md new file mode 100644 index 00000000..87790a17 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/README.md @@ -0,0 +1,159 @@ +# G2_artifact_eval + +## Introduction +This repository is the guide for evaluating our CGO2021 paper, "Techniques for Compiling Graphs Algorithms for GPUs". This guide has steps for cloning, compiling and executing the implementation of the compiler framework G2 which is built on top of the [GraphIt DSL compiler](https://graphit-lang.org/). +This guide has two parts - + - Part 1: Reproducing Figure 3. in the paper to demonstrate that the compiler can generate very different optimized code for different schedules + - Part 2: Reproducing the G2 columns for all the applications and graphs from Table 7 to demonstrate the performance of the code generated from the G2 compiler + +Since Table 7 shows the performance numbers when run on the NVIDIA-Tesla V-100 GPU, the exact execution times you will get in Part 2 will depend on the actual GPU you use. If you do not have access to the same GPU, we have provided access to our system with this GPU in our artifact evaluation submission. +If you use any other GPU the schedules might have to be tuned to get the best performance for the GPU. + +## Requirements +We expect you to run the artifact evaluation on a Linux system with at least 40GBs of space. Following are the software requirements for each of the parts + +### Part 1 +Since part 1 only demonstrates the different code generated for different schedules, this part does *NOT* require an NVIDIA GPU or CUDA to be installed. The only software requirements are - + + - cmake (>= 3.5.1) + - CXX compiler (like g++ >= 5.4.0) + - python3 + - make + - bash + - git + +### Part 2 +Part 2 demonstrates the performance of these applications on the actual GPU. Ideally we require an NVIDIA Tesla V-100 for best results, but other NVIDIA GPUs would also work (the actual performance numbers would be different in that case). 
Following are the requirements besides all the requirements from Part 1 - + + - NVIDIA GPU (Pascal generation or better, preferred NVIDIA Tesla V-100 32 GB, access to our machine provided in the artifact evaluation submission). + - CUDA SDK (>= 9.0) + + +## How to run + +### Cloning +We will start by cloning this repository on the evaluation system using the following command - + + git clone --recursive https://github.com/AjayBrahmakshatriya/G2_artifact_eval.git + +If you have already cloned this repository without the `--recursive` option, you can get the submodules by running the following commands. Otherwise you can directly proceed to Building G2. + + git submodule init + git submodule update + +### Building G2 +Start by navigating to the `G2_artifact_eval` directory. We will first build the G2 compiler by running the following commands from the repo's top level directory - + + cd graphit + mkdir build + cd build + cmake .. + make -j$(nproc) + +If no errors are reported, the G2 compiler is built correctly and you can navigate back to the repository's top level directory and proceed to "Running Part 1". + +### Running Part 1 +With the G2 compiler built, you can run Part 1 to generate the code as shown in Figure 3. You can start by returning to the top level directory of the repository with the command `cd ../../` and then running the command - + + python3 gen_fig3.py + +When running this command, the program will prompt for a few options, like the path to where the G2 compiler is built and the output directory path. If you have followed the above steps, you can simply press enter and choose the default options shown in `[]`. + +This command should take about 5 minutes to run, and if it doesn't report any errors, the appropriate files have been generated. Notice that the above command also prints all the commands that were executed to generate the output files. + +The source files for the three schedules in Figure 3 are in the `fig3_inputs/` directory - `fig3_inputs/fig3_a.gt`, `fig3_inputs/fig3_b.gt` and `fig3_inputs/fig3_c.gt`. You can open and read them in your favorite text editor. All three programs have the same algorithm input but different schedules at the bottom under the `schedule:` section. You can match these schedules with the ones in the paper (barring some syntactic changes in the paper for brevity). + +If you choose the default options while running the above programs, the outputs should be generated in the `fig3_outputs/` directory - `fig3_outputs/fig3_a.gt.cu`, `fig3_outputs/fig3_b.gt.cu` and `fig3_outputs/fig3_c.gt.cu`. Again, you can open and read them in your favorite text editor or simply `cat` them. + +You can match the body of the `main` function and the user-defined function `updateEdges`. Again, we have changed the syntax a little in the paper for brevity. + +### Obtaining the datasets +The datasets are only required for Part 2. If you are not planning to run Part 2, you do not need to obtain the datasets. + +We have created two datasets for your convenience - *small* and *all*. The small dataset contains just two graphs (one with a bounded degree distribution and one with a power-law degree distribution). Obtaining and running the small dataset should take less than 15 minutes and quickly tests all the variants for all algorithms. The all dataset contains all 9 graphs from the paper and would take much longer to run (upwards of 1.5 hours on our system). + +There are two ways of obtaining the datasets. 
If you are running this artifact evaluation on the system we have provided access to, you can quickly fetch all the dataset files by running the following commands in the top level directory -
+
+    cd dataset
+    make local
+
+If everything succeeds, the dataset should be soft-linked into this directory and you can verify that by running the `ls` command. You can now navigate to the top level directory using the command `cd ../` and proceed to the next step ("Running Part 2"). If the command reports the error
+
+> You are not running this command on the right host. Please use `make dataset` instead
+
+it means that you are not running the artifact evaluation on our system and you should use the other method for downloading the datasets.
+
+If you are running the artifact evaluation on your own system, the script will have to download a tarball and extract the files. We have a separate command for the *small* and *all* datasets. So if you are planning to run the evaluation only for the 2 graphs, please download only the small dataset to save time.
+
+For downloading the *all* dataset run the following commands from the top-level directory -
+
+    cd dataset
+    make dataset
+
+To download just the *small* dataset run the following commands from the top-level directory -
+
+    cd dataset
+    make small
+
+
+This step will take some time, because it downloads and uncompresses all the datasets. After the command succeeds, you can verify that the files are downloaded by running the `ls` command. The small dataset is part of the all dataset, so if you accidentally downloaded the all dataset, you can still run the small part of the experiment.
+
+Navigate to the top level directory in any case using the command - `cd ../`
+
+### Running Part 2
+This part evaluates the generated code for all the applications and inputs to reproduce Table 7 in the paper. A reminder that if you are running the experiments on a system with any GPU other than the NVIDIA Tesla V-100 (32 GB), the results might be different. The system we have provided with the artifact evaluation has the correct GPU.
+
+Before we actually run the evaluation, we will list all the GPUs in the system and find one that is completely free. We need a free GPU because the performance might be hampered if other processes are running on the same GPU.
+
+Start by running the command -
+
+    nvidia-smi
+
+This will list all the GPUs attached to the system, numbered from 0. At the bottom of the table, there is a Processes section which shows what processes are running on which GPU. Find a GPU which doesn't have any processes running on it and note down its ID. Suppose, for the purpose of this evaluation, that the 4th GPU (ID: 3) is free and we want to use that.
+
+We do not recommend running the evaluation on a GPU that is being used by other processes since it might affect the evaluation results (and correctness) a lot.
+
+Before running the actual command for running all the experiments, make sure you have successfully built G2 and fetched the datasets. If you are planning to run the *all* dataset, make sure you have downloaded the entire dataset.
+
+To run only the small dataset, navigate to the top level directory of the repository and run the command -
+
+    python3 gen_table7.py small
+
+To run the all dataset, navigate to the top level directory of the repository and run the command -
+
+    python3 gen_table7.py
+
+Again, like Part 1, the program will prompt for various options like the path to the CUDA compiler, the CXX compiler, the path to the G2 build directory and the GPU to use.
Following is the description of each of the options -
+
+- Output directory to use: This is the directory where the output of this section will be generated. Please select the default option by pressing enter (notice that the outputs from previous runs will be wiped, so if you are planning to run multiple times and want to preserve old results, copy the results somewhere else).
+- GraphIt build directory: This is the path to the `build/` directory where G2 is compiled. If you have followed the exact steps above, just choose the default by pressing enter.
+- Dataset path: This is the directory where the datasets are fetched. If you have followed the exact steps mentioned above, just select the default by pressing enter.
+- NVCC path: This is the path to the `nvcc` compiler from the CUDA SDK. Typically this binary is located at `/usr/local/cuda/bin/nvcc`. If you have installed it elsewhere, please provide the path here. If you have the binary in your `$PATH` variable (you can verify this by running `nvcc --version`), you can simply type `nvcc` and press enter. If you are using the system that we have provided, just press the enter key.
+- CXX_COMPILER path: This is the path to the CXX compiler that you want to use. The default option is `/usr/bin/g++`. If you are using a different compiler, please provide the path here. If you are using the system that we have provided, just press the enter key.
+- GPU ID to use: This is the ID of the GPU that you want to run the experiments on. We obtained the ID of a free GPU in the step above; enter that here. If the 4th GPU (ID: 3) is free, type `3` and press enter. The default option is `0`, but `0` might not be free.
+
+Once you enter all the options, the experiments will run one after the other. The program will print which application it is currently running and how many graphs it has finished evaluating on. Sit back, because running all the applications can take a while (~20 mins for the small dataset and >1.5 hrs for the all dataset).
+
+If the program completes execution without any errors, all the experiments are done and you can view the final results in the output directory. If you chose the default options, the output file should be under `table7_outputs/table7.txt`. The program should also print the table on successful completion.
+
+
+## Evaluating related works
+Table 7 in the paper also compares against other related works to show the speedups we obtain with our compiler. Unfortunately the source code of some of the related works is not directly usable (we found some bugs in the systems and had to fix them ourselves). But the related work Gunrock has a system that is easy to build and evaluate.
+
+The source code for Gunrock is available in the repository -
+
+    https://github.com/gunrock/gunrock
+
+Building Gunrock is very easy and you can simply follow the instructions under "Quick Start Guide". Gunrock also provides access to docker containers that should have all the dependencies packaged. You can use those if it is more convenient.
+
+If you successfully build Gunrock, you can run the experiments with the same datasets that you obtained for the G2 evaluation (mtx file format).
+
+For example, to run the PageRank application with Gunrock, use the `pr` binary located in the `build/bin/` directory. You can run it using the command -
+
+    ./pr market /path/to/dataset/roadNet-CA.mtx --num-runs=10 --quick --device=3 --advance-mode=TWC
+
+Here `--device=3` is the GPU to run the experiments on. Change the ID to the proper GPU you want to run on.
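+To sweep Gunrock's PageRank over every graph you fetched for the G2 evaluation, a small shell loop around the same invocation is enough. The sketch below is only a suggestion, not part of the artifact scripts; it assumes the graphs live under the placeholder path `/path/to/dataset/` and that you run it from Gunrock's `build/bin/` directory -
+
+    # sketch: run Gunrock PR on every .mtx graph in the G2 dataset directory
+    for g in /path/to/dataset/*.mtx; do
+        echo "== $g =="
+        ./pr market "$g" --num-runs=10 --quick --device=3 --advance-mode=TWC
+    done
+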
For other applications like BFS and BC some extra parameters like `do-bfs-a` and `do-bfs-b` are required. The details of all applications, their command line arguments and reference results can be found at - + + https://gunrock.github.io/docs/#/analysis/engines_topc/engines_topc_table + + diff --git a/graphit_eval/g2_cgo2021_eval/dataset/Makefile b/graphit_eval/g2_cgo2021_eval/dataset/Makefile new file mode 100644 index 00000000..a800bf73 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/dataset/Makefile @@ -0,0 +1,13 @@ +small: + wget "https://www.dropbox.com/s/s92m1rqjvwvkf4m/graph-dataset-small.tar.gz?dl=0" -O graph-dataset-small.tar.gz + tar xvf graph-dataset-small.tar.gz + rm graph-dataset-small.tar.gz + +dataset: + wget "https://www.dropbox.com/s/ysk7sk8yor2o71g/graph-dataset.tar.gz?dl=0" -O graph-dataset.tar.gz + tar xvf graph-dataset.tar.gz + rm graph-dataset.tar.gz +local: + bash local.sh + + diff --git a/graphit_eval/g2_cgo2021_eval/dataset/local.sh b/graphit_eval/g2_cgo2021_eval/dataset/local.sh new file mode 100644 index 00000000..c409a803 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/dataset/local.sh @@ -0,0 +1,6 @@ +if [[ $(hostname) == "lanka-dgx0.csail.mit.edu" ]]; then + echo OK + ln -s /local/ajaybr/graph-dataset/clean_general/*.mtx . +else + echo You are not running this command on the right host. Please use `make dataset` instead +fi diff --git a/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_a.gt b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_a.gt new file mode 100644 index 00000000..ef741291 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_a.gt @@ -0,0 +1,51 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + + s1.configDeduplication(DISABLED); + s1.configDirection(PUSH); + s1.configLoadBalance(VERTEX_BASED); + s1.configFrontierCreation(FUSED); + program->applyGPUSchedule("s0:s1", s1); + + diff --git a/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_b.gt b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_b.gt new file mode 100644 index 00000000..443ad1fd --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_b.gt @@ -0,0 +1,51 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for 
trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); + diff --git a/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_c.gt b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_c.gt new file mode 100644 index 00000000..43f2a687 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_c.gt @@ -0,0 +1,56 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configDirection(PUSH); + s1.configLoadBalance(TWCE); + + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + s2.configDeduplication(DISABLED); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + program->applyGPUSchedule("s0:s1", h1); + + diff --git a/graphit_eval/g2_cgo2021_eval/gen_fig3.py b/graphit_eval/g2_cgo2021_eval/gen_fig3.py new file mode 100644 index 00000000..0cf6952b --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/gen_fig3.py @@ -0,0 +1,72 @@ +import os +import subprocess +DIR_PATH=os.path.dirname(os.path.realpath(__file__)).rstrip("/") + +SCRATCH_PATH="" +GRAPHIT_BUILD_PATH="" +APPS_DIRECTORY="" + + + +def read_default_path(message, default): + print(message + " [" + default + "]: ", end="") + val = input().strip().rstrip("/") + if val == "": + val = default + return val + +def get_command_output(command): + output = "" + if isinstance(command, list): + proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + exitcode = proc.wait() + if exitcode != 0: + print(command) + assert(exitcode == 0) + for line in proc.stdout.readlines(): + if isinstance(line, bytes): + line = line.decode() + output += line.rstrip() + "\n" + 
proc.stdout.close() + return output + +def compile_application(gtfile): + get_command_output("python3 " + GRAPHIT_BUILD_PATH + "/bin/graphitc.py -f " + APPS_DIRECTORY + "/" + gtfile + " -o " + gtfile + ".cu") + + +def run_tests(): + compile_application("fig3_a.gt") + compile_application("fig3_b.gt") + compile_application("fig3_c.gt") + + os.system("rm compile.cpp compile.o") + + +def main(): + global SCRATCH_PATH + global GRAPHIT_BUILD_PATH + global APPS_DIRECTORY + + print("Starting artifact evaluation in directory: ", DIR_PATH) + SCRATCH_PATH = read_default_path("Please choose a output directory to use", DIR_PATH + "/fig3_outputs") + GRAPHIT_BUILD_PATH = read_default_path("Please choose GraphIt build directory", DIR_PATH + "/../../build") + APPS_DIRECTORY = DIR_PATH+"/fig3_inputs" + + if os.path.exists(SCRATCH_PATH): + os.system("rm -rf " + SCRATCH_PATH) + os.makedirs(SCRATCH_PATH) + + os.chdir(SCRATCH_PATH) + + + run_tests() + + + + + + +if __name__ == "__main__": + main() diff --git a/graphit_eval/g2_cgo2021_eval/gen_table7.py b/graphit_eval/g2_cgo2021_eval/gen_table7.py new file mode 100644 index 00000000..d1777001 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/gen_table7.py @@ -0,0 +1,338 @@ +import os +import subprocess +import sys +DIR_PATH=os.path.dirname(os.path.realpath(__file__)).rstrip("/") + +SCRATCH_PATH="" +GRAPHIT_BUILD_PATH="" +DATASET_PATH="" +APPS_DIRECTORY="" +GPU_ID="" +NVCC_PATH="" +CXX_COMPILER="" +NVCC_COMMAND="" +GPU_PREFIX="" + + +ORKUT="" +TWITTER="" +LIVEJOURNAL="" +SINAWEIBO="" +HOLLYWOOD="" +INDOCHINA="" +RUSA="" +RCA="" +RCENTRAL="" +GRAPH_ALL=[] +GRAPH_SOCIAL=[] +GRAPH_ROAD=[] + +def find_dataset_files(): + global ORKUT + global TWITTER + global LIVEJOURNAL + global SINAWEIBO + global HOLLYWOOD + global INDOCHINA + global RUSA + global RCA + global RCENTRAL + global GRAPH_ALL + global GRAPH_ROAD + global GRAPH_SOCIAL + + ORKUT=DATASET_PATH+"/soc-orkut.mtx" + TWITTER=DATASET_PATH+"/soc-twitter-2010.mtx" + LIVEJOURNAL=DATASET_PATH+"/soc-LiveJournal1.mtx" + SINAWEIBO=DATASET_PATH+"/soc-sinaweibo.mtx" + HOLLYWOOD=DATASET_PATH+"/hollywood-2009.weighted.mtx" + INDOCHINA=DATASET_PATH+"/indochina-2004.weighted.mtx" + RUSA=DATASET_PATH+"/road_usa.weighted.mtx" + RCA=DATASET_PATH+"/roadNet-CA.weighted.mtx" + RCENTRAL=DATASET_PATH+"/road_central.weighted.mtx" + + if len(sys.argv) >= 2 and sys.argv[1] == "small": + GRAPH_SOCIAL=[('livejournal', LIVEJOURNAL)] + GRAPH_ROAD=[('rca', RCA)] + else: + GRAPH_SOCIAL=[('orkut', ORKUT), ('twitter', TWITTER), ('livejournal', LIVEJOURNAL), ('sinaweibo', SINAWEIBO), ('indochina', INDOCHINA), ('hollywood', HOLLYWOOD)] + GRAPH_ROAD=[('rca', RCA), ('rusa', RUSA), ('rcentral', RCENTRAL)] + + GRAPH_ALL = GRAPH_SOCIAL + GRAPH_ROAD + + + +def read_default_path(message, default): + print(message + " [" + default + "]: ", end="") + val = input().strip().rstrip("/") + if val == "": + val = default + return val + +def get_gpu_count(): + gpus = os.popen("nvidia-smi -L").read().strip() + return len(gpus.split("\n")) + +def get_command_output(command): + output = "" + if isinstance(command, list): + proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + exitcode = proc.wait() + if exitcode != 0: + print(command) + assert(exitcode == 0) + for line in proc.stdout.readlines(): + if isinstance(line, bytes): + line = line.decode() + output += line.rstrip() + "\n" + proc.stdout.close() + return output + +def 
set_NVCC_COMMAND(MAX_REG=64): + global NVCC_COMMAND + + NVCC_COMMAND = NVCC_PATH + " -ccbin " + CXX_COMPILER + " " + + get_command_output(NVCC_COMMAND + APPS_DIRECTORY + "/obtain_gpu_cc.cu -o obtain_gpu_cc") + output = get_command_output(GPU_PREFIX+"./obtain_gpu_cc").split() + + if len(output) != 2: + print ("Cannot obtain GPU information") + exit(-1) + compute_capability = output[0] + num_of_sm = output[1] + + if MAX_REG == 64: + NVCC_COMMAND += " -rdc=true -DNUM_CTA=" + str(int(num_of_sm)*2) + " -DCTA_SIZE=512 -gencode arch=compute_" + compute_capability + ",code=sm_" + compute_capability + elif MAX_REG == 512: + CTA_STYLE = (int(int(num_of_sm)/2), int(512/2)) + NVCC_COMMAND += " -rdc=true -DNUM_CTA=" + str(CTA_STYLE[0]) + " -DCTA_SIZE=" + str(CTA_STYLE[1]) + " -gencode arch=compute_" + compute_capability + ",code=sm_" + compute_capability + else: + print("Invalid MAX_REG configuration, not supported\n") + exit(-1) + + NVCC_COMMAND += " -std=c++11 -O3 -I " + DIR_PATH+"/../.." + "/src/runtime_lib/ -Xcompiler \"-w\" -Wno-deprecated-gpu-targets --use_fast_math -Xptxas \" -dlcm=ca --maxrregcount=" + str(MAX_REG) + "\" " + + +def compile_application(gtfile, binname): + if os.path.exists(binname): + return + get_command_output("python3 " + GRAPHIT_BUILD_PATH + "/bin/graphitc.py -f " + APPS_DIRECTORY + "/" + gtfile + " -o " + gtfile + ".cu") + get_command_output(NVCC_COMMAND + gtfile + ".cu -o " + binname) + + +def run_sanity_check(): + compile_application("simple_graph_load.gt", "load") + get_command_output(GPU_PREFIX+"./load " + RCA) + + +def compile_and_run(gtfile, binname, run_args, outputf): + compile_application(gtfile, binname) + output = get_command_output(GPU_PREFIX+"./"+binname + " " + run_args) + f = open(outputf, "w") + f.write(output) + f.close() + + +def run_pr(): + set_NVCC_COMMAND() + print("Running eval for Pagerank") + PR = "pr.gt" + for i, (name, graph) in enumerate(GRAPH_ALL): + compile_and_run(PR, "pr", graph, "pr_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + + +def run_cc(): + set_NVCC_COMMAND() + print("Running eval for Connected Components") + CC = "cc.gt" + for i, (name, graph) in enumerate(GRAPH_ALL): + compile_and_run(CC, "cc", graph, "cc_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + + +def run_ds(): + delta = {} + delta["orkut"] = 22 + delta["livejournal"] = 120 + delta["twitter"] = 15 + delta["sinaweibo"] = 15 + delta["hollywood"] = 15 + delta["indochina"] = 10000 + delta["rusa"] = 80000 + delta["rcentral"] = 30000 + delta["rca"] = 20000 + + print ("Running eval for Delta Stepping") + DS_SOCIAL = "ds_social.gt" + DS_ROAD = "ds_road.gt" + set_NVCC_COMMAND() + for i, (name, graph) in enumerate(GRAPH_SOCIAL): + compile_and_run(DS_SOCIAL, "ds_social", graph + " 0 " + str(delta[name]), "ds_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + + set_NVCC_COMMAND(512) + for i, (name, graph) in enumerate(GRAPH_ROAD): + compile_and_run(DS_ROAD, "ds_road", graph + " 0 " + str(delta[name]), "ds_" + name + ".out") + print(str(i+1+len(GRAPH_SOCIAL)) + "/" + str(len(GRAPH_ALL))) + +def run_bc(): + threshold = {} + threshold["orkut"] = 0.010 + threshold["livejournal"] = 0.006 + threshold["twitter"] = 0.023 + threshold["sinaweibo"] = 0.008 + threshold["hollywood"] = 0.026 + threshold["indochina"] = 0.99 + + print ("Running eval for Betweenness Centrality") + BC_SOCIAL = "bc_social.gt" + BC_ROAD = "bc_road.gt" + set_NVCC_COMMAND() + for i, (name, graph) in enumerate(GRAPH_SOCIAL): + compile_and_run(BC_SOCIAL, "bc_social", graph 
+ " 0 " + str(threshold[name]), "bc_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + set_NVCC_COMMAND(512) + for i, (name, graph) in enumerate(GRAPH_ROAD): + compile_and_run(BC_ROAD, "bc_road", graph + " 0", "bc_" + name + ".out") + print(str(i+1+len(GRAPH_SOCIAL)) + "/" + str(len(GRAPH_ALL))) + + +def run_bfs(): + threshold = {} + threshold["orkut"] = 0.010 + threshold["livejournal"] = 0.006 + threshold["twitter"] = 0.023 + threshold["sinaweibo"] = 0.008 + threshold["hollywood"] = 0.026 + threshold["indochina"] = 0.99 + + print ("Running eval for Breadth First Search") + BFS_SOCIAL = "bfs_social.gt" + BFS_ROAD = "bfs_road.gt" + set_NVCC_COMMAND() + for i, (name, graph) in enumerate(GRAPH_SOCIAL): + compile_and_run(BFS_SOCIAL, "bfs_social", graph + " 0 " + str(threshold[name]), "bfs_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + set_NVCC_COMMAND(512) + for i, (name, graph) in enumerate(GRAPH_ROAD): + compile_and_run(BFS_ROAD, "bfs_road", graph + " 0", "bfs_" + name + ".out") + print(str(i+1+len(GRAPH_SOCIAL)) + "/" + str(len(GRAPH_ALL))) + + +def read_execution_time(filename): + try: + f = open(SCRATCH_PATH + "/" + filename, "r") + values = f.read().strip().split("\n") + values = [float(val) for val in values] + min_val = min(values) + min_val = int(min_val * 100000) / 100.0 + return min_val + except: + return -1 + + +def run_tests(): + # get the GPU properties first + set_NVCC_COMMAND() + run_sanity_check() + run_pr() + run_cc() + run_ds() + run_bc() + run_bfs() + + +def print_cell(f, val): + spaces = 9 - len(str(val)) + f.write(" " * spaces + str(val) + " |") + +def gen_table7(): + short_names = {} + short_names["orkut"] = "OK" + short_names["twitter"] = "TW" + short_names["livejournal"] = "LJ" + short_names["sinaweibo"] = "SW" + short_names["hollywood"] = "HW" + short_names["indochina"] = "IC" + short_names["rusa"] = "RU" + short_names["rca"] = "RN" + short_names["rcentral"] = "RC" + + filepath = SCRATCH_PATH + "/table7.txt" + f = open(filepath, "w") + + f.write("-" * 67) + f.write("\n") + f.write("|") + print_cell(f, "Graph") + print_cell(f, "PR") + print_cell(f, "CC") + print_cell(f, "BFS") + print_cell(f, "BC") + print_cell(f, "SSSP") + f.write("\n") + f.write("-" * 67) + f.write("\n") + + for graph, _ in GRAPH_ALL: + f.write("|") + print_cell(f, short_names[graph]) + for app in ["pr", "cc", "bfs", "bc", "ds"]: + fname = app + "_" + graph + ".out" + val = read_execution_time(fname) + print_cell(f, val) + f.write("\n") + + f.write("-" * 67) + f.write("\n") + + f.close() + print(open(filepath, "r").read()) + print("# This table is generated at: ", filepath) + + +def main(): + global SCRATCH_PATH + global GRAPHIT_BUILD_PATH + global DATASET_PATH + global APPS_DIRECTORY + global GPU_ID + global NVCC_PATH + global CXX_COMPILER + global GPU_PREFIX + + print("Starting artifact evaluation in directory: ", DIR_PATH) + SCRATCH_PATH = read_default_path("Please choose a output directory to use", DIR_PATH + "/table7_outputs") + GRAPHIT_BUILD_PATH = read_default_path("Please choose GraphIt build directory", DIR_PATH + "/../../build") + DATASET_PATH = read_default_path("Please choose dataset path", DIR_PATH + "/dataset") + APPS_DIRECTORY = DIR_PATH+"/table7_inputs" + NVCC_PATH = read_default_path("Please choose NVCC path", "/usr/local/cuda/bin/nvcc") + CXX_COMPILER = read_default_path("Please choose CXX_COMPILER", "/usr/bin/g++") + + if os.path.exists(SCRATCH_PATH): + os.system("rm -rf " + SCRATCH_PATH) + os.makedirs(SCRATCH_PATH) + + os.chdir(SCRATCH_PATH) + + 
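+    # The remaining prompts select a GPU: get_gpu_count() counts the devices reported by
+    # `nvidia-smi -L`, and the chosen ID is applied to every run through the
+    # CUDA_VISIBLE_DEVICES prefix so all generated binaries execute on that single device.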
+ total_devices = get_gpu_count() + GPU_ID = read_default_path("Choose GPU id to use (0-" + str(total_devices-1) + ")", str(0)) + GPU_PREFIX="CUDA_VISIBLE_DEVICES="+GPU_ID+" " + + + find_dataset_files() + run_tests() + gen_table7() + + + + + + +if __name__ == "__main__": + main() diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_road.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_road.gt new file mode 100644 index 00000000..fc84b683 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_road.gt @@ -0,0 +1,125 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); + +const num_paths : vector{Vertex}(double) = 0; +const dependences : vector{Vertex}(float) = 0; +const visited : vector{Vertex}(bool) = false; + +func forward_update(src : Vertex, dst : Vertex) + num_paths[dst] += num_paths[src]; +end + +func visited_vertex_filter(v : Vertex) -> output : bool + output = (visited[v] == false); +end + +func mark_visited(v : Vertex) + visited[v] = true; +end + +func mark_unvisited(v : Vertex) + visited[v] = false; +end + +func backward_vertex_f(v : Vertex) + visited[v] = true; + dependences[v] += 1.0 / num_paths[v]; +end + +func backward_update(src : Vertex, dst : Vertex) + dependences[dst] += dependences[src]; +end + +func final_vertex_f(v : Vertex) + if num_paths[v] != 0 + dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v]; + else + dependences[v] = 0; + end +end + +func reset(v : Vertex) + dependences[v] = 0; + num_paths[v] = 0; + visited[v] = false; +end + + + + +func main() + + % transposing the edges + var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose(); + for trail in 0:10 + startTimer(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + + frontier.addVertex(start_vertex); + num_paths[start_vertex] = 1; + visited[start_vertex] = true; + var round : int = 0; + var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}(); + + frontier_list.insert(frontier); + + % foward pass to propagate num_paths + #s0# while (frontier.getVertexSetSize() != 0) + round = round + 1; + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths); + delete frontier; + output.apply(mark_visited); + frontier_list.insert(output); + frontier = output; + end + + + % resetting the visited information for the backward pass + vertices.apply(mark_unvisited); + + % pop off the empty frontier + frontier_list.retrieve(frontier); + + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + + % backward pass to accumulate the dependencies + #s2# while (round > 0) + #s3# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update); + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + end + delete frontier; + + vertices.apply(final_vertex_f); + var elapsed_time : float = stopTimer(); + print elapsed_time; + vertices.apply(reset); + end + + + +end + + +schedule: + + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + s1.configDeduplication(ENABLED, FUSED); + + program->applyGPUSchedule("s0:s1", s1); + program->applyGPUSchedule("s2:s3", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + + program->applyGPUSchedule("s0", s0); + program->applyGPUSchedule("s2", s0); diff --git 
a/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_social.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_social.gt new file mode 100644 index 00000000..ae797d24 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_social.gt @@ -0,0 +1,126 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); + +const num_paths : vector{Vertex}(double) = 0; +const dependences : vector{Vertex}(float) = 0; +const visited : vector{Vertex}(bool) = false; + +func forward_update(src : Vertex, dst : Vertex) + num_paths[dst] += num_paths[src]; +end + +func visited_vertex_filter(v : Vertex) -> output : bool + output = (visited[v] == false); +end + +func mark_visited(v : Vertex) + visited[v] = true; +end + +func mark_unvisited(v : Vertex) + visited[v] = false; +end + +func backward_vertex_f(v : Vertex) + visited[v] = true; + dependences[v] += 1.0 / num_paths[v]; +end + +func backward_update(src : Vertex, dst : Vertex) + dependences[dst] += dependences[src]; +end + +func final_vertex_f(v : Vertex) + if num_paths[v] != 0 + dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v]; + else + dependences[v] = 0; + end +end + +func reset(v : Vertex) + dependences[v] = 0; + num_paths[v] = 0; + visited[v] = false; +end + + + + +func main() + + % transposing the edges + var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose(); + for trail in 0:10 + startTimer(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + + frontier.addVertex(start_vertex); + num_paths[start_vertex] = 1; + visited[start_vertex] = true; + var round : int = 0; + var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}(); + + frontier_list.insert(frontier); + + % foward pass to propagate num_paths + while (frontier.getVertexSetSize() != 0) + round = round + 1; + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths); + delete frontier; + output.apply(mark_visited); + frontier_list.insert(output); + frontier = output; + end + + + % resetting the visited information for the backward pass + vertices.apply(mark_unvisited); + + % pop off the empty frontier + frontier_list.retrieve(frontier); + + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + + % backward pass to accumulate the dependencies + while (round > 0) + #s2# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update); + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + end + delete frontier; + + vertices.apply(final_vertex_f); + var elapsed_time : float = stopTimer(); + print elapsed_time; + vertices.apply(reset); + end + + + +end + + +schedule: + + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + + SimpleGPUSchedule s2; + s2.configLoadBalance(TWCE); + s2.configDirection(PULL, BITMAP); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + + program->applyGPUSchedule("s1", h1); + program->applyGPUSchedule("s2", h1); + diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_road.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_road.gt new file mode 100644 index 00000000..af81d4d3 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_road.gt @@ -0,0 +1,55 @@ +element Vertex end 
+element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configDeduplication(DISABLED); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + + + program->applyGPUSchedule("s0:s1", s1); + + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_social.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_social.gt new file mode 100644 index 00000000..7634b0ff --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_social.gt @@ -0,0 +1,59 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + + s1.configDeduplication(DISABLED); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + s2.configDeduplication(DISABLED); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + program->applyGPUSchedule("s0:s1", h1); + + diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/cc.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/cc.gt new file mode 100644 index 00000000..65af4db0 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/cc.gt @@ -0,0 +1,63 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); + +const vertices : vertexset{Vertex} = edges.getVertices(); +const IDs : vector{Vertex}(int) = 1; + +const update: 
vector[1](int); + +func updateEdge(src : Vertex, dst : Vertex) + var src_id: Vertex = IDs[src]; + var dst_id: Vertex = IDs[dst]; + + IDs[dst_id] min= IDs[src_id]; + IDs[src_id] min= IDs[dst_id]; +end + +func init(v : Vertex) + IDs[v] = v; +end + +func pjump(v: Vertex) + var y: Vertex = IDs[v]; + var x: Vertex = IDs[y]; + if x != y + IDs[v] = x; + update[0] = 1; + end +end + +func main() + var n : int = edges.getVertices(); + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(n); + startTimer(); + vertices.apply(init); + while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs); + delete frontier; + frontier = output; + update[0] = 1; + #s0# while update[0] != 0 + update[0] = 0; + vertices.apply(pjump); + end + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(CM); + s1.configDeduplication(ENABLED); + s1.configFrontierCreation(UNFUSED_BITMAP); + program->applyGPUSchedule("s1", s1); + + diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_road.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_road.gt new file mode 100644 index 00000000..7e048331 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_road.gt @@ -0,0 +1,49 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print elapsed_time; + delete pq; + end +end + + +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(CM); + s1.configFrontierCreation(FUSED); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_social.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_social.gt new file mode 100644 index 00000000..c9947024 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_social.gt @@ -0,0 +1,45 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 
2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print elapsed_time; + delete pq; + end +end + + +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(UNFUSED_BOOLMAP); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/obtain_gpu_cc.cu b/graphit_eval/g2_cgo2021_eval/table7_inputs/obtain_gpu_cc.cu new file mode 100644 index 00000000..bdec4266 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/obtain_gpu_cc.cu @@ -0,0 +1,31 @@ +#include +#include +#include + +int main(int argc, char *argv[]) { + cudaDeviceProp prop; + cudaError_t status; + int device_count; + int device_index = 0; + if (argc > 1) { + device_index = atoi(argv[1]); + } + + status = cudaGetDeviceCount(&device_count); + if (status != cudaSuccess) { + fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status)); + return -1; + } + if (device_index >= device_count) { + fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", device_index, device_count); + return -1; + } + status = cudaGetDeviceProperties(&prop, device_index); + if (status != cudaSuccess) { + fprintf(stderr,"cudaGetDeviceProperties() for device device_index failed: %s\n", cudaGetErrorString(status)); + return -1; + } + int v = prop.major * 10 + prop.minor; + printf("%d\n", v); + printf("%d\n", prop.multiProcessorCount); +} diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/pr.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/pr.gt new file mode 100644 index 00000000..0e16007f --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/pr.gt @@ -0,0 +1,60 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const old_rank : vector{Vertex}(float) = 1.0/vertices.size(); +const new_rank : vector{Vertex}(float) = 0.0; +const out_degree : vector {Vertex}(int) = edges.getOutDegrees(); +const contrib : vector{Vertex}(float) = 0.0; +const error : vector{Vertex}(float) = 0.0; +const damp : float = 0.85; +const beta_score : float = (1.0 - damp) / vertices.size(); + +func computeContrib(v : Vertex) + contrib[v] = old_rank[v] / out_degree[v]; +end + +func updateEdge(src : Vertex, dst : Vertex) + new_rank[dst] += contrib[src]; +end + +func updateVertex(v : Vertex) + var old_score : float = old_rank[v]; + new_rank[v] = beta_score + damp*(new_rank[v]); + error[v] = fabs(new_rank[v] - old_rank[v]); + old_rank[v] = new_rank[v]; + new_rank[v] = 0.0; + +end + +func printRank(v : Vertex) + print old_rank[v]; +end + +func reset(v: Vertex) + old_rank[v] = 1.0/vertices.size(); + new_rank[v] = 0.0; +end + +func main() + for trail in 0:10 + startTimer(); + vertices.apply(reset); + for round in 0:20 + vertices.apply(computeContrib); + #s1# edges.apply(updateEdge); + vertices.apply(updateVertex); + end + var elapsed_time : float = stopTimer(); + print elapsed_time/20.0; + end +end + +% specify schedules here or use a separate schedule 
file +schedule: + SimpleGPUSchedule s1; + s1.configDirection(PULL); + s1.configLoadBalance(EDGE_ONLY, BLOCKED, 0x42000); + + program->applyGPUSchedule("s1", s1); + diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/simple_graph_load.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/simple_graph_load.gt new file mode 100644 index 00000000..bc50ce27 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/simple_graph_load.gt @@ -0,0 +1,13 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex, Vertex, int) = load (argv[1]); + +func main() + #s1# print edges.getVertices(); +end + + +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); diff --git a/include/graphit/backend/backend.h b/include/graphit/backend/backend.h index 2bc2f80c..b712a014 100644 --- a/include/graphit/backend/backend.h +++ b/include/graphit/backend/backend.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace graphit { class Backend { @@ -18,6 +19,7 @@ namespace graphit { int emitCPP(std::ostream &oss = std::cout, std::string module_name=""); int emitPython(std::ostream &oss = std::cout, std::string module_name="", std::string module_path=""); + int emitGPU(std::ostream &oss = std::cout, std::string module_name="", std::string module_path=""); private: MIRContext* mir_context_; diff --git a/include/graphit/backend/codegen_gpu/assign_function_context.h b/include/graphit/backend/codegen_gpu/assign_function_context.h new file mode 100644 index 00000000..0b6bb309 --- /dev/null +++ b/include/graphit/backend/codegen_gpu/assign_function_context.h @@ -0,0 +1,30 @@ +#ifndef ASSIGN_FUNCTION_CONTEXT_H +#define ASSIGN_FUNCTION CONTEXT_H + +#include +#include +#include + +#include +#include + + +namespace graphit { +class AssignFunctionContext : mir::MIRVisitor { + public: + AssignFunctionContext(MIRContext *mir_context) : mir_context_(mir_context) { + } + int assign_function_context(void); + protected: + void visit(mir::PushEdgeSetApplyExpr::Ptr); + void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr); + void visit(mir::PullEdgeSetApplyExpr::Ptr); + void visit(mir::VertexSetApplyExpr::Ptr); + void visit(mir::VertexSetWhereExpr::Ptr); + private: + MIRContext *mir_context_; +}; +} + + +#endif diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h new file mode 100644 index 00000000..a5c97125 --- /dev/null +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -0,0 +1,233 @@ + +#ifndef GRAPHIT_CODEGEN_GPU_H +#define GRAPHIT_CODEGEN_GPU_H + +#include +#include +#include +#include +#include +#include +#include + +namespace graphit { +class CodeGenGPUKernelEmitter: public mir::MIRVisitor { +public: + CodeGenGPUKernelEmitter(std::ostream &input_oss, MIRContext *mir_context): + oss(input_oss), mir_context_(mir_context), indentLevel(0) { + } + void indent() { ++indentLevel; } + void dedent() { --indentLevel; } + void printIndent() { oss << std::string(indentLevel, '\t'); } + + std::ostream &oss; + unsigned indentLevel; + + MIRContext * mir_context_; + + void visit(mir::PushEdgeSetApplyExpr::Ptr); + void visit(mir::PullEdgeSetApplyExpr::Ptr); + void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr); + + void genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr); + +}; + +class CodeGenGPU: public mir::MIRVisitor{ +public: + CodeGenGPU(std::ostream &input_oss, MIRContext *mir_context, std::string module_name_, std::string module_path): + oss(input_oss), mir_context_(mir_context), module_name(module_name_) { + indentLevel = 0; 
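+		// generator that emits the edgeset apply function declarations used by the generated GPU code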
+ edgeset_apply_func_gen_ = new EdgesetApplyFunctionDeclGenerator(mir_context_, oss); + } + + int genGPU(); +protected: + + void indent() { ++indentLevel; } + void dedent() { --indentLevel; } + void printIndent() { oss << std::string(indentLevel, '\t'); } + + std::ostream &oss; + std::string module_name; + unsigned indentLevel; + MIRContext * mir_context_; + +private: + void genGlobalDeclarations(void); + void genIncludeStmts(void); + void genEdgeSets(void); + void genHybridThresholds(void); + void genFuncDecl(mir::FuncDecl::Ptr); + + + void genPropertyArrayImplementationWithInitialization(mir::VarDecl::Ptr shared_ptr); + + + void genPropertyArrayDecl(mir::VarDecl::Ptr); + void genScalarDecl(mir::VarDecl::Ptr); + void genPropertyArrayAlloca(mir::VarDecl::Ptr); + + void genFusedWhileLoop(mir::WhileStmt::Ptr); + virtual void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr); + + EdgesetApplyFunctionDeclGenerator* edgeset_apply_func_gen_; + + virtual std::string getBackendFunctionLabel(void) { + return "__device__"; + } +protected: + std::vector kernel_hoisted_vars; + std::string current_kernel_name; + bool is_hoisted_var (mir::Var var) { + std::string var_name = var.getName(); + + size_t dot_pos = var_name.find("."); + if (dot_pos != std::string::npos) { + var_name.resize(dot_pos); + } + for (auto h_var: kernel_hoisted_vars) { + if (h_var.getName() == var_name) + return true; + } + return false; + } + + + void generateBinaryExpr(mir::BinaryExpr::Ptr, std::string); +protected: + virtual void visit(mir::EdgeSetType::Ptr) override; + virtual void visit(mir::PriorityQueueType::Ptr) override; + virtual void visit(mir::VertexSetType::Ptr) override; + virtual void visit(mir::ScalarType::Ptr) override; + virtual void visit(mir::FuncDecl::Ptr) override; + virtual void visit(mir::ElementType::Ptr) override; + virtual void visit(mir::ExprStmt::Ptr) override; + virtual void visit(mir::VarExpr::Ptr) override; + virtual void visit(mir::AssignStmt::Ptr) override; + + virtual void visit(mir::AddExpr::Ptr) override; + virtual void visit(mir::MulExpr::Ptr) override; + virtual void visit(mir::DivExpr::Ptr) override; + virtual void visit(mir::SubExpr::Ptr) override; + virtual void visit(mir::EqExpr::Ptr) override; + virtual void visit(mir::NegExpr::Ptr) override; + + virtual void visit(mir::TensorArrayReadExpr::Ptr) override; + virtual void visit(mir::IntLiteral::Ptr) override; + virtual void visit(mir::FloatLiteral::Ptr) override; + virtual void visit(mir::BoolLiteral::Ptr) override; + virtual void visit(mir::StringLiteral::Ptr) override; + + + + virtual void visit(mir::ReduceStmt::Ptr) override; + virtual void visit(mir::CompareAndSwapStmt::Ptr) override; + + virtual void visit(mir::VarDecl::Ptr) override; + + virtual void visit(mir::ForStmt::Ptr) override; + virtual void visit(mir::WhileStmt::Ptr) override; + virtual void visit(mir::IfStmt::Ptr) override; + virtual void visit(mir::PrintStmt::Ptr) override; + virtual void visit(mir::Call::Ptr) override; + + virtual void visit(mir::BreakStmt::Ptr) override; + + virtual void visit(mir::VertexSetApplyExpr::Ptr) override; + virtual void visit(mir::VertexSetAllocExpr::Ptr) override; + + virtual void visit(mir::VertexSetDedupExpr::Ptr) override; + virtual void visit(mir::HybridGPUStmt::Ptr) override; + + virtual void visit(mir::EnqueueVertex::Ptr) override; + + virtual void visit(mir::VertexSetWhereExpr::Ptr) override; + + + virtual void visit(mir::ListType::Ptr) override; + virtual void visit(mir::ListAllocExpr::Ptr) override; + + void 
genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr); + void generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare); + void generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare); + +}; +class CodeGenGPUHost: public CodeGenGPU { +public: + using CodeGenGPU::CodeGenGPU; + using CodeGenGPU::visit; +private: + virtual std::string getBackendFunctionLabel(void) { + return "__host__"; + } + virtual void visit(mir::TensorArrayReadExpr::Ptr) override; + virtual void visit(mir::StmtBlock::Ptr) override; + + virtual void visit(mir::Call::Ptr) override; + virtual void visit(mir::PrintStmt::Ptr) override; + virtual void visit(mir::VarExpr::Ptr) override; + + +}; + + +class CodeGenGPUFusedKernel: public CodeGenGPU { +public: + using CodeGenGPU::CodeGenGPU; + using CodeGenGPU::visit; + + mir::WhileStmt::Ptr current_while_stmt; + void insertUsedPq(mir::Var var) { + for (auto v: current_while_stmt->used_priority_queues) { + if (v.getName() == var.getName()) + return; + } + current_while_stmt->used_priority_queues.push_back(var); + } + virtual void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr) override; + virtual void visit(mir::StmtBlock::Ptr) override; + virtual void visit(mir::AssignStmt::Ptr) override; + virtual void visit(mir::VarDecl::Ptr) override; + virtual void visit(mir::VarExpr::Ptr) override; + virtual void visit(mir::PrintStmt::Ptr) override; + virtual void visit(mir::HybridGPUStmt::Ptr) override; + virtual void visit(mir::VertexSetDedupExpr::Ptr) override; + virtual void visit(mir::VertexSetApplyExpr::Ptr) override; + + std::string var_name (std::string var) { + //return current_kernel_name + "_" + var; + return "__local_" + var; + } +}; + +class KernelVariableExtractor: public mir::MIRVisitor { +public: + using mir::MIRVisitor::visit; + std::vector hoisted_vars; + std::vector hoisted_decls; + std::vector hoisted_pqs; + + MIRContext *mir_context_; + KernelVariableExtractor(MIRContext* mir_context): mir_context_(mir_context) { + } + + void insertVar(mir::Var var_to_insert) { + for (auto var: hoisted_vars) + if (var.getName() == var_to_insert.getName()) + return; + hoisted_vars.push_back(var_to_insert); + } + void insertDecl(mir::VarDecl::Ptr decl_to_insert) { + hoisted_decls.push_back(decl_to_insert); + mir::Var var(decl_to_insert->name, decl_to_insert->type); + insertVar(var); + } + + virtual void visit(mir::VarExpr::Ptr); + virtual void visit(mir::VarDecl::Ptr); + virtual void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr); +}; + +} +#endif diff --git a/include/graphit/backend/codegen_gpu/extract_read_write_set.h b/include/graphit/backend/codegen_gpu/extract_read_write_set.h new file mode 100644 index 00000000..13ba54a1 --- /dev/null +++ b/include/graphit/backend/codegen_gpu/extract_read_write_set.h @@ -0,0 +1,30 @@ +#ifndef EXTRACT_READ_WRITE_H +#define EXTRACT_READ_WRITE_H + +#include +#include +#include +namespace graphit { +class ExtractReadWriteSet: public mir::MIRVisitor { +public: + ExtractReadWriteSet(MIRContext *mir_context_): read_set(read_set_), write_set(write_set_), mir_context(mir_context_) { + } + const std::vector &read_set; + const std::vector &write_set; + +protected: + virtual void visit(mir::TensorArrayReadExpr::Ptr); + virtual void visit(mir::AssignStmt::Ptr); + virtual void visit(mir::StmtBlock::Ptr); + MIRContext *mir_context; + +private: + void add_read(mir::TensorArrayReadExpr::Ptr); + void add_write(mir::TensorArrayReadExpr::Ptr); + + std::vector read_set_; + std::vector write_set_; +}; +} + +#endif diff --git 
a/include/graphit/frontend/fir.h b/include/graphit/frontend/fir.h index 012dea54..7c0756b2 100644 --- a/include/graphit/frontend/fir.h +++ b/include/graphit/frontend/fir.h @@ -2,6 +2,7 @@ // Created by Yunming Zhang on 1/24/17. // + #ifndef GRAPHIT_FIR_H #define GRAPHIT_FIR_H diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h new file mode 100644 index 00000000..9ed0e6a1 --- /dev/null +++ b/include/graphit/frontend/gpu_schedule.h @@ -0,0 +1,329 @@ +// +// Created by Ajay Brahmakshatriya +// + +#ifndef GRAPHIT_GPU_SCHEDULE +#define GRAPHIT_GPU_SCHEDULE + +#include + + +namespace graphit { +namespace fir { +namespace gpu_schedule { + + +enum gpu_schedule_options { + PUSH, + PULL, + FUSED, + UNFUSED, + UNFUSED_BITMAP, + UNFUSED_BOOLMAP, + ENABLED, + DISABLED, + TWC, + TWCE, + WM, + CM, + STRICT, + EDGE_ONLY, + VERTEX_BASED, + INPUT_VERTEXSET_SIZE, + BITMAP, + BOOLMAP, + BLOCKED, + UNBLOCKED, +}; + +class GPUSchedule { + // Abstract class has no functions for now +public: + // Virtual destructor to make the class polymorphic + virtual ~GPUSchedule() = default; +}; + +class SimpleGPUSchedule: public GPUSchedule { + +public: + enum class pull_frontier_rep_type { + BITMAP, + BOOLMAP + }; + enum class direction_type { + DIR_PUSH, + DIR_PULL + }; + + enum class frontier_creation_type { + FRONTIER_FUSED, + UNFUSED_BITMAP, + UNFUSED_BOOLMAP + }; + + enum class deduplication_type { + DEDUP_DISABLED, + DEDUP_ENABLED + }; + enum class deduplication_strategy_type { + DEDUP_FUSED, + DEDUP_UNFUSED + }; + + enum class load_balancing_type { + VERTEX_BASED, + TWC, + TWCE, + WM, + CM, + STRICT, + EDGE_ONLY + }; + + enum class edge_blocking_type { + BLOCKED, + UNBLOCKED + }; + + enum class kernel_fusion_type { + FUSION_DISABLED, + FUSION_ENABLED + }; + + enum class boolean_type_type { + BOOLMAP, + BITMAP + }; + +private: +public: + direction_type direction; + pull_frontier_rep_type pull_frontier_rep; + frontier_creation_type frontier_creation; + deduplication_type deduplication; + deduplication_strategy_type deduplication_strategy; + load_balancing_type load_balancing; + edge_blocking_type edge_blocking; + uint32_t edge_blocking_size; + kernel_fusion_type kernel_fusion; + boolean_type_type boolean_type; + + int32_t delta; + + SimpleGPUSchedule () { + direction = direction_type::DIR_PUSH; + pull_frontier_rep = pull_frontier_rep_type::BOOLMAP; + frontier_creation = frontier_creation_type::FRONTIER_FUSED; + deduplication = deduplication_type::DEDUP_DISABLED; + load_balancing = load_balancing_type::VERTEX_BASED; + edge_blocking = edge_blocking_type::UNBLOCKED; + edge_blocking_size = 0; + kernel_fusion = kernel_fusion_type::FUSION_DISABLED; + delta = 1; + boolean_type = boolean_type_type::BOOLMAP; + } + +public: + void configDirection(enum gpu_schedule_options o, enum gpu_schedule_options r = BOOLMAP) { + switch(o) { + case PUSH: + direction = direction_type::DIR_PUSH; + break; + case PULL: + direction = direction_type::DIR_PULL; + switch (r) { + case BITMAP: + pull_frontier_rep = pull_frontier_rep_type::BITMAP; + break; + case BOOLMAP: + pull_frontier_rep = pull_frontier_rep_type::BOOLMAP; + break; + default: + assert(false && "Invalid option for Pull Frontier representation\n"); + break; + } + break; + default: + assert(false && "Invalid option for configDirection"); + break; + } + } + + void configFrontierCreation(enum gpu_schedule_options o) { + switch(o) { + case FUSED: + frontier_creation = frontier_creation_type::FRONTIER_FUSED; + break; + case UNFUSED_BITMAP: + 
frontier_creation = frontier_creation_type::UNFUSED_BITMAP; + break; + case UNFUSED_BOOLMAP: + frontier_creation = frontier_creation_type::UNFUSED_BOOLMAP; + break; + default: + assert(false && "Invalid option for configFrontierCreation"); + break; + } + } + + void configDeduplication(enum gpu_schedule_options o, enum gpu_schedule_options l = UNFUSED) { + switch(o) { + case ENABLED: + deduplication = deduplication_type::DEDUP_ENABLED; + switch (l) { + case FUSED: + deduplication_strategy = deduplication_strategy_type::DEDUP_FUSED; + break; + case UNFUSED: + deduplication_strategy = deduplication_strategy_type::DEDUP_UNFUSED; + break; + default: + assert(false && "Invalid deduplication strategy\n"); + break; + } + break; + case DISABLED: + deduplication = deduplication_type::DEDUP_DISABLED; + break; + default: + assert(false && "Invalid option for configDeduplication"); + break; + } + } + + void configLoadBalance(enum gpu_schedule_options o, enum gpu_schedule_options blocking = UNBLOCKED, int32_t blocking_size = 1) { + switch(o) { + case VERTEX_BASED: + load_balancing = load_balancing_type::VERTEX_BASED; + break; + case TWC: + load_balancing = load_balancing_type::TWC; + break; + case TWCE: + load_balancing = load_balancing_type::TWCE; + break; + case WM: + load_balancing = load_balancing_type::WM; + break; + case CM: + load_balancing = load_balancing_type::CM; + break; + case STRICT: + load_balancing = load_balancing_type::STRICT; + break; + case EDGE_ONLY: + load_balancing = load_balancing_type::EDGE_ONLY; + switch (blocking) { + case BLOCKED: + edge_blocking = edge_blocking_type::BLOCKED; + edge_blocking_size = blocking_size; + break; + case UNBLOCKED: + edge_blocking = edge_blocking_type::UNBLOCKED; + break; + default: + assert(false && "Invalid option for configLoadBalance"); + break; + } + break; + default: + assert(false && "Invalid option for configLoadBalance"); + break; + } + } + + void configKernelFusion(enum gpu_schedule_options o) { + switch(o) { + case ENABLED: + kernel_fusion = kernel_fusion_type::FUSION_ENABLED; + break; + case DISABLED: + kernel_fusion = kernel_fusion_type::FUSION_DISABLED; + break; + default: + assert(false && "Invalid option for configKernelFusion"); + break; + } + + } + void configDelta(int32_t d) { + if (d <= 0) + assert(false && "Invalid option for configDelta"); + delta = d; + } + void configDelta(const char* d) { + if (sscanf(d, "argv[%i]", &delta) != 1) { + assert(false && "Invalid option for configDelta"); + } + delta *= -1; + } + void configBooleanType(enum gpu_schedule_options o) { + switch(o) { + case BOOLMAP: + boolean_type = boolean_type_type::BOOLMAP; + break; + case BITMAP: + boolean_type = boolean_type_type::BITMAP; + break; + default: + assert(false && "Invalid option for configBooleanType"); + break; + } + } + +}; + +class HybridGPUSchedule: public GPUSchedule { +private: + // TODO: have separate alpha beta +public: + SimpleGPUSchedule s1; + SimpleGPUSchedule s2; + + float threshold; + int32_t argv_index; + + enum class hybrid_criteria { + INPUT_VERTEXSET_SIZE + }; + hybrid_criteria _hybrid_criteria; +private: +public: + HybridGPUSchedule (enum gpu_schedule_options o, float t, SimpleGPUSchedule &_s1, SimpleGPUSchedule &_s2) { + switch(o) { + case INPUT_VERTEXSET_SIZE: + _hybrid_criteria = hybrid_criteria::INPUT_VERTEXSET_SIZE; + break; + default: + assert(false && "Invalid option for HybridGPUScheduleCriteria\n"); + break; + } + threshold = t; + s1 = _s1; + s2 = _s2; + } + HybridGPUSchedule (enum gpu_schedule_options o, const char *t, 
SimpleGPUSchedule &_s1, SimpleGPUSchedule &_s2) { + switch (o) { + case INPUT_VERTEXSET_SIZE: + _hybrid_criteria = hybrid_criteria::INPUT_VERTEXSET_SIZE; + break; + default: + assert(false && "Invalid option for HybridGPUScheduleCriteria\n"); + break; + } + s1 = _s1; + s2 = _s2; + if (sscanf(t, "argv[%i]", &argv_index) != 1) { + assert(false && "Invalid threshold option\n"); + } + threshold = -100; + } +}; + + +} +} +} + +#endif + diff --git a/include/graphit/frontend/high_level_schedule.h b/include/graphit/frontend/high_level_schedule.h index 9809ed9d..b1267524 100644 --- a/include/graphit/frontend/high_level_schedule.h +++ b/include/graphit/frontend/high_level_schedule.h @@ -16,6 +16,8 @@ #include #include +#include + namespace graphit { namespace fir { @@ -52,6 +54,14 @@ namespace graphit { if (schedule_ != nullptr) delete(schedule_); } + enum class backend_selection_type { + CODEGEN_CPU, + CODEGEN_GPU, + + CODEGEN_INVALID + }; + + backend_selection_type backend_selection = backend_selection_type::CODEGEN_CPU; typedef std::shared_ptr Ptr; @@ -214,6 +224,34 @@ namespace graphit { return schedule_; } + + // New GPU Scheduling API + // We currently need two different functions to apply simple and hybrid schedules + // TODO: Abstract the simple and hybrid schedules into a single class + void applyGPUSchedule(std::string label_name, gpu_schedule::SimpleGPUSchedule &s1) { + backend_selection = backend_selection_type::CODEGEN_GPU; + + if (schedule_ == nullptr) + schedule_ = new Schedule(); + + gpu_schedule::SimpleGPUSchedule *s1_copy = new gpu_schedule::SimpleGPUSchedule(s1); + + schedule_->apply_gpu_schedules[label_name] = s1_copy; + + } + void applyGPUSchedule(std::string label_name, gpu_schedule::HybridGPUSchedule &s2) { + backend_selection = backend_selection_type::CODEGEN_GPU; + + if (schedule_ == nullptr) + schedule_ = new Schedule(); + + gpu_schedule::HybridGPUSchedule *s2_copy = new gpu_schedule::HybridGPUSchedule(s2); + *s2_copy = s2; + + schedule_->apply_gpu_schedules[label_name] = s2_copy; + } + + private: graphit::FIRContext * fir_context_; Schedule * schedule_; diff --git a/include/graphit/frontend/schedule.h b/include/graphit/frontend/schedule.h index 52888e2f..689ef85b 100644 --- a/include/graphit/frontend/schedule.h +++ b/include/graphit/frontend/schedule.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace graphit { @@ -241,6 +242,9 @@ namespace graphit { ~Schedule() { delete physical_data_layouts; delete apply_schedules; + for (auto s = apply_gpu_schedules.begin(); s != apply_gpu_schedules.end(); s++) { + delete s->second; + } } //TODO: what does it mean?? 
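The scheduling objects and the two applyGPUSchedule overloads added above are meant to be driven from a scheduling program. A minimal sketch of how they compose, assuming a schedule-node pointer named program of the type that declares applyGPUSchedule; the label "s1", the use of argv[3] for the hybrid threshold, and the particular option choices are illustrative, not prescribed by the diff:

using namespace graphit::fir::gpu_schedule;

SimpleGPUSchedule s1;                      // push variant
s1.configDirection(PUSH);
s1.configLoadBalance(TWCE);
s1.configFrontierCreation(FUSED);
s1.configDeduplication(ENABLED);

SimpleGPUSchedule s2 = s1;                 // pull variant with a bitmap frontier
s2.configDirection(PULL, BITMAP);
s2.configLoadBalance(VERTEX_BASED);

// Choose between the push and pull variants at runtime based on the size of
// the input vertexset, with the threshold read from argv[3] (illustrative).
HybridGPUSchedule h1(INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2);
program->applyGPUSchedule("s1", h1);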
@@ -258,8 +262,8 @@ namespace graphit { std::map *par_for_num_threads; + std::map apply_gpu_schedules; }; } - #endif //GRAPHIT_SCHEDULE_H diff --git a/include/graphit/midend/apply_expr_lower.h b/include/graphit/midend/apply_expr_lower.h index 9c000421..eb04fd11 100644 --- a/include/graphit/midend/apply_expr_lower.h +++ b/include/graphit/midend/apply_expr_lower.h @@ -36,10 +36,15 @@ namespace graphit { //Lowers edgeset apply expressions virtual void visit(mir::EdgeSetApplyExpr::Ptr edgeset_apply_expr); virtual void visit(mir::VertexSetApplyExpr::Ptr vertexset_apply_expr); - + + virtual void visit(mir::StmtBlock::Ptr stmt_block); + virtual void visit(mir::VarDecl::Ptr var_decl); + virtual void visit(mir::AssignStmt::Ptr assign_stmt); + virtual void visit(mir::ExprStmt::Ptr assign_stmt); Schedule * schedule_; MIRContext* mir_context_; + mir::Stmt::Ptr insert_after_stmt = nullptr; }; private: diff --git a/include/graphit/midend/atomics_op_lower.h b/include/graphit/midend/atomics_op_lower.h index 50f18d61..acc9e56f 100644 --- a/include/graphit/midend/atomics_op_lower.h +++ b/include/graphit/midend/atomics_op_lower.h @@ -17,7 +17,7 @@ namespace graphit { class AtomicsOpLower { public: - AtomicsOpLower(MIRContext *mir_context) : mir_context_(mir_context) {}; + AtomicsOpLower(MIRContext *mir_context, Schedule * s) : mir_context_(mir_context), schedule_(s){}; struct ApplyExprVisitor : public mir::MIRVisitor { ApplyExprVisitor(MIRContext *mir_context) : @@ -47,7 +47,7 @@ namespace graphit { }; struct ReduceStmtLower : public mir::MIRVisitor { - ReduceStmtLower(MIRContext* mir_context) : mir_context_(mir_context){ + ReduceStmtLower(MIRContext* mir_context, Schedule* s) : mir_context_(mir_context), schedule_(s){ } @@ -55,6 +55,7 @@ namespace graphit { private: MIRContext *mir_context_ = nullptr; + Schedule *schedule_ = nullptr; }; @@ -63,6 +64,7 @@ namespace graphit { private: MIRContext *mir_context_ = nullptr; + Schedule *schedule_ = nullptr; diff --git a/include/graphit/midend/frontier_reuse_analysis.h b/include/graphit/midend/frontier_reuse_analysis.h new file mode 100644 index 00000000..731c2d15 --- /dev/null +++ b/include/graphit/midend/frontier_reuse_analysis.h @@ -0,0 +1,38 @@ +#ifndef FRONTIER_REUSE_ANALYSIS_H +#define FRONTIER_REUSE_ANALYSIS_H + +#include +#include +#include +#include +namespace graphit { +class FrontierReuseAnalysis { +public: + MIRContext *mir_context_; + FrontierReuseAnalysis (MIRContext* mir_context): mir_context_(mir_context) { + } + struct ReuseFindingVisitor: public mir::MIRVisitor { + MIRContext *mir_context_; + ReuseFindingVisitor(MIRContext* mir_context): mir_context_(mir_context) { + } + using mir::MIRVisitor::visit; + std::vector to_deletes; + bool is_frontier_reusable(mir::StmtBlock::Ptr, int index, std::string frontier_name); + virtual void visit(mir::StmtBlock::Ptr) override; + }; + struct FrontierUseFinder: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + bool is_used = false; + std::string frontier_name; + + virtual void visit(mir::VarExpr::Ptr) override; + virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::EdgeSetApplyExpr::Ptr) override; + + }; + void analyze(void); +}; + +} +#endif diff --git a/include/graphit/midend/gpu_change_tracking_lower.h b/include/graphit/midend/gpu_change_tracking_lower.h new file mode 100644 index 00000000..f6a1e078 --- /dev/null +++ b/include/graphit/midend/gpu_change_tracking_lower.h @@ -0,0 +1,41 @@ +#ifndef 
GPU_CHANGE_TRACKING_LOWER_H +#define GPU_CHANGE_TRACKING_LOWER_H + +#include +#include +#include +namespace graphit { +class GPUChangeTrackingLower { +public: + MIRContext *mir_context_; + Schedule *schedule_; + GPUChangeTrackingLower(MIRContext *mir_context, Schedule *schedule): mir_context_(mir_context), schedule_(schedule) { + } + void lower (void); + struct UdfArgChangeVisitor: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + MIRContext *mir_context_; + UdfArgChangeVisitor(MIRContext *mir_context): mir_context_(mir_context) { + } + void updateUdf(mir::FuncDecl::Ptr func_decl, mir::EdgeSetApplyExpr::Ptr); + virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr) override; + }; + + struct ReductionOpChangeVisitor: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + MIRContext *mir_context_; + mir::EdgeSetApplyExpr::Ptr current_edge_set_apply_expr; + std::string udf_tracking_var; + mir::Type::Ptr frontier_type; + ReductionOpChangeVisitor(MIRContext *mir_context, std::string tracking_var, mir::EdgeSetApplyExpr::Ptr edge_set_apply_expr, mir::Type::Ptr type): mir_context_(mir_context), udf_tracking_var(tracking_var), current_edge_set_apply_expr(edge_set_apply_expr), frontier_type(type) { + } + virtual void visit(mir::StmtBlock::Ptr) override; + + + }; +}; +} + +#endif diff --git a/include/graphit/midend/gpu_priority_features_lowering.h b/include/graphit/midend/gpu_priority_features_lowering.h new file mode 100644 index 00000000..8ee97ab5 --- /dev/null +++ b/include/graphit/midend/gpu_priority_features_lowering.h @@ -0,0 +1,57 @@ +#ifndef GPU_PRIORITY_FEATURES_LOWERING_H +#define GPU_PRIORITY_FEATURES_LOWERING_H + +#include +#include +#include + +namespace graphit { +class GPUPriorityFeaturesLowering { +public: + MIRContext *mir_context_; + Schedule *schedule_; + GPUPriorityFeaturesLowering(MIRContext* mir_context, Schedule* schedule): mir_context_(mir_context), schedule_(schedule) { + } + void lower(void); + + + struct EdgeSetApplyPriorityRewriter: public mir::MIRRewriter { + MIRContext *mir_context_; + Schedule *schedule_; + EdgeSetApplyPriorityRewriter(MIRContext* mir_context, Schedule* schedule): mir_context_(mir_context), schedule_(schedule) { + } + + using mir::MIRRewriter::visit; + virtual void visit(mir::ExprStmt::Ptr) override; + + }; + struct PriorityUpdateOperatorRewriter: public mir::MIRRewriter { + MIRContext *mir_context_; + mir::UpdatePriorityEdgeSetApplyExpr::Ptr puesae_; + PriorityUpdateOperatorRewriter(MIRContext* mir_context, mir::UpdatePriorityEdgeSetApplyExpr::Ptr puesae): mir_context_(mir_context), puesae_(puesae) { + } + using mir::MIRRewriter::visit; + virtual void visit(mir::Call::Ptr) override; + + }; + struct UDFPriorityQueueFinder: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + + MIRContext *mir_context_; + UDFPriorityQueueFinder(MIRContext* mir_context): mir_context_(mir_context) { + } + std::vector priority_queues_used; + mir::Var getPriorityQueue(void); + void insertVar(mir::Var); + virtual void visit(mir::PriorityUpdateOperator::Ptr) override; + virtual void visit(mir::PriorityUpdateOperatorMin::Ptr) override; + virtual void visit(mir::PriorityUpdateOperatorSum::Ptr) override; + virtual void visit(mir::Call::Ptr) override; + }; +}; +} + +#endif + + + diff --git a/include/graphit/midend/gpu_vector_field_properties_analyzer.h b/include/graphit/midend/gpu_vector_field_properties_analyzer.h new file mode 
100644 index 00000000..91d2fbfe --- /dev/null +++ b/include/graphit/midend/gpu_vector_field_properties_analyzer.h @@ -0,0 +1,50 @@ +#ifndef GPU_VECTOR_FIELD_PROPERTIES_ANALYZER_H +#define GPU_VECTOR_FIELD_PROPERTIES_ANALYZER_H + +#include +#include +#include +#include +namespace graphit { + +class GPUVectorFieldPropertiesAnalyzer { + struct PropertyAnalyzingVisitor: public mir::MIRVisitor { + MIRContext* mir_context_; + + std::unordered_set independent_variables; + mir::FuncDecl::Ptr enclosing_function; + + PropertyAnalyzingVisitor(MIRContext* mir_context, std::unordered_set idp, mir::FuncDecl::Ptr ef): mir_context_(mir_context), independent_variables(idp), enclosing_function(ef) { + } + + using mir::MIRVisitor::visit; + + bool is_independent_index(mir::Expr::Ptr); + + virtual void visit(mir::TensorReadExpr::Ptr) override; + virtual void visit(mir::AssignStmt::Ptr) override; + virtual void visit(mir::ReduceStmt::Ptr) override; + + virtual void visit(mir::PriorityUpdateOperatorMin::Ptr) override; + + }; + struct ApplyExprVisitor: public mir::MIRVisitor { + MIRContext* mir_context_; + ApplyExprVisitor(MIRContext* mir_context): mir_context_(mir_context) { + } + using mir::MIRVisitor::visit; + virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr) override; + }; + + MIRContext* mir_context_; +public: + void analyze(void); + GPUVectorFieldPropertiesAnalyzer(MIRContext* mir_context, Schedule* schedule): mir_context_(mir_context) { + } +}; + +} +#endif + diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index c9596273..e5c9d200 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -11,10 +11,13 @@ #include #include #include +#include #include #include #include #include +#include + #include namespace graphit { @@ -53,6 +56,8 @@ namespace graphit { return to(cloneNode()); } + // We use a single map to hold all metadata on the MIR Node + std::unordered_map> metadata_map; protected: template std::shared_ptr self() { @@ -67,6 +72,41 @@ namespace graphit { // as I slowly add in support for copy functionalities return nullptr; }; + public: + // Functions to set and retrieve metadata of different types + template + void setMetadata(std::string mdname, T val) { + typename MIRMetadataImpl::Ptr mdnode = std::make_shared>(val); + metadata_map[mdname] = mdnode; + } + // This function is safe to be called even if the metadata with + // the specified name doesn't exist + template + bool hasMetadata(std::string mdname) { + if (metadata_map.find(mdname) == metadata_map.end()) + return false; + typename MIRMetadata::Ptr mdnode = metadata_map[mdname]; + if (!mdnode->isa()) + return false; + return true; + } + // This function should be called only after confirming that the + // metadata with the given name exists + template + T getMetadata(std::string mdname) { + assert(hasMetadata(mdname)); + typename MIRMetadata::Ptr mdnode = metadata_map[mdname]; + return mdnode->to()->val; + } + std::unordered_map> cloneMetadata(void) { + std::unordered_map> new_map; + for (auto iter = metadata_map.begin(); iter != metadata_map.end(); iter++) { + auto key = iter->first; + new_map[key] = metadata_map[key]->clone(); + } + return new_map; + } + }; struct Expr : public MIRNode { @@ -224,6 +264,11 @@ namespace graphit { visitor->visit(self()); } + enum class BoolType { + BYTE, BIT + }; + BoolType bool_type; + std::string toString(){ std::string output_str 
= ""; if (type == mir::ScalarType::Type::FLOAT){ @@ -434,6 +479,12 @@ namespace graphit { typedef std::shared_ptr Ptr; + bool is_fused; + std::string fused_kernel_name; + std::vector hoisted_vars; + std::vector> hoisted_decls; + std::vector used_priority_queues; + virtual void accept(MIRVisitor *visitor) { visitor->visit(self()); } @@ -614,6 +665,14 @@ namespace graphit { enum class Type { INTERNAL, EXPORTED, EXTERNAL }; + enum function_context_type { + CONTEXT_NONE = 0x0, + CONTEXT_HOST = 0x1, + CONTEXT_DEVICE = 0x2, + CONTEXT_BOTH = 0x3, + }; + + enum function_context_type function_context = function_context_type::CONTEXT_HOST; std::string name; std::vector functorArgs; @@ -625,6 +684,7 @@ namespace graphit { //TODO: replace this with a statement StmtBlock::Ptr body; + typedef std::shared_ptr Ptr; @@ -638,6 +698,17 @@ namespace graphit { virtual MIRNode::Ptr cloneNode(); }; + static inline FuncDecl::function_context_type operator | (FuncDecl::function_context_type a, FuncDecl::function_context_type b) { + return static_cast((int)a | (int)b); + } + static inline FuncDecl::function_context_type operator & (FuncDecl::function_context_type a, FuncDecl::function_context_type b) { + return static_cast((int)a & (int)b); + } + static inline FuncDecl::function_context_type& operator |= (FuncDecl::function_context_type &a, FuncDecl::function_context_type b) { + a = a | b; + return a; + } + struct FuncExpr : public Expr { @@ -769,6 +840,8 @@ namespace graphit { std::string tracking_var; bool is_atomic = false; + std::shared_ptr edgeset_apply_expr; + typedef std::shared_ptr Ptr; virtual void accept(MIRVisitor *visitor) { @@ -918,6 +991,12 @@ namespace graphit { std::string tracking_field = ""; typedef std::shared_ptr Ptr; + std::string device_function; + std::string kernel_function; + + fir::gpu_schedule::SimpleGPUSchedule applied_schedule; + bool requires_output = false; + protected: virtual void copy(MIRNode::Ptr); @@ -981,7 +1060,8 @@ namespace graphit { std::string scope_label_name; MergeReduceField::Ptr merge_reduce; - + bool fused_dedup = false; + bool fused_dedup_perfect = false; bool frontier_reusable = false; std::string edgeset_apply_func_name; @@ -1016,7 +1096,12 @@ namespace graphit { is_weighted = edgeset_apply->is_weighted; is_parallel = edgeset_apply->is_parallel; enable_deduplication = edgeset_apply->enable_deduplication; - frontier_reusable = edgeset_apply->frontier_reusable; + + applied_schedule = edgeset_apply->applied_schedule; + frontier_reusable = edgeset_apply->frontier_reusable; + requires_output = edgeset_apply->requires_output; + fused_dedup = edgeset_apply->fused_dedup; + fused_dedup_perfect = edgeset_apply->fused_dedup_perfect; } virtual void accept(MIRVisitor *visitor) { @@ -1043,7 +1128,11 @@ namespace graphit { is_weighted = edgeset_apply->is_weighted; is_parallel = edgeset_apply->is_parallel; enable_deduplication = edgeset_apply->enable_deduplication; - frontier_reusable = edgeset_apply->frontier_reusable; + applied_schedule = edgeset_apply->applied_schedule; + frontier_reusable = edgeset_apply->frontier_reusable; + requires_output = edgeset_apply->requires_output; + fused_dedup = edgeset_apply->fused_dedup; + fused_dedup_perfect = edgeset_apply->fused_dedup_perfect; } virtual void accept(MIRVisitor *visitor) { @@ -1465,6 +1554,7 @@ namespace graphit { typedef std::shared_ptr Ptr; UpdatePriorityEdgeSetApplyExpr() {} + mir::Var priority_queue_used; UpdatePriorityEdgeSetApplyExpr(EdgeSetApplyExpr::Ptr edgeset_apply) { target = edgeset_apply->target; @@ -1599,6 +1689,52 
@@ namespace graphit { }; + + // GPU Specific operators + struct VertexSetDedupExpr: Expr { + Expr::Ptr target; + bool perfect_dedup; + typedef std::shared_ptr Ptr; + virtual void accept(MIRVisitor *visitor) { + visitor->visit(self()); + } + protected: + virtual void copy(MIRNode::Ptr); + virtual MIRNode::Ptr cloneNode(); + }; + struct HybridGPUStmt: Stmt { + StmtBlock::Ptr stmt1; + StmtBlock::Ptr stmt2; + float threshold; + int32_t argv_index; + std::string threshold_var_name; + fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria criteria; + + std::string input_frontier_name; + + typedef std::shared_ptr Ptr; + virtual void accept(MIRVisitor *visitor) { + visitor->visit(self()); + } + protected: + virtual void copy(MIRNode::Ptr); + virtual MIRNode::Ptr cloneNode(); + }; + struct EnqueueVertex: Stmt { + Expr::Ptr vertex_id; + Expr::Ptr vertex_frontier; + bool fused_dedup; + bool fused_dedup_perfect; + enum class Type {SPARSE, BOOLMAP, BITMAP}; + Type type; + typedef std::shared_ptr Ptr; + virtual void accept(MIRVisitor *visitor) { + visitor->visit(self()); + } + protected: + virtual void copy(MIRNode::Ptr); + virtual MIRNode::Ptr cloneNode(); + }; } } diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index c367905d..acde1bcb 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -152,6 +152,7 @@ namespace graphit { std::vector getEdgeSets() { return const_edge_sets_; } + mir::VarDecl::Ptr getConstEdgeSetByName(std::string var_name) { @@ -168,6 +169,34 @@ namespace graphit { } return false; } + bool isLoweredConstTensor(std::string var_name) { + for (auto tensor: lowered_constants_) { + if (tensor->name == var_name) + return true; + } + return false; + } + bool isLoweredConst(std::string var_name) { + size_t dot_pos = var_name.find("."); + if (dot_pos != std::string::npos) { + var_name.resize(dot_pos); + } + + for (auto var: lowered_constants_) { + if (var->name == var_name) + return true; + } + for (auto var: const_edge_sets_) { + if (var->name == var_name) + return true; + } + for (auto var: const_priority_queues_) { + if (var->name == var_name) + return true; + } + + return false; + } void addConstVertexSet(mir::VarDecl::Ptr vertexset) { const_vertex_sets_.push_back(vertexset); @@ -250,6 +279,21 @@ namespace graphit { } } + mir::VarDecl::Ptr getEdgeSetFromElementType(mir::ElementType::Ptr element_type) { + for (auto decl: getEdgeSets()) { + mir::Type::Ptr type = decl->type; + assert(mir::isa(type)); + mir::EdgeSetType::Ptr edge_set_type = mir::to(type); + if (edge_set_type->vertex_element_type_list == nullptr) + continue; + if (edge_set_type->vertex_element_type_list->size() !=2) + continue; + if ((*(edge_set_type->vertex_element_type_list))[0]->ident == element_type->ident && (*(edge_set_type->vertex_element_type_list))[1]->ident == element_type->ident) + return decl; + } + return nullptr; + } + bool updateElementInputFilename(mir::ElementType::Ptr element_type, mir::Expr::Ptr file_name) { input_filename_map_[element_type->ident] = file_name; return true; @@ -347,6 +391,7 @@ namespace graphit { // These are global sets that are loaded from outside sources and cannot be modified std::vector const_vertex_sets_; std::vector const_edge_sets_; + std::vector const_priority_queues_; //maps a vector to the Element it is associated with; std::map vector_set_element_type_map_; @@ -416,6 +461,13 @@ namespace graphit { std::vector types_requiring_typedef; + // Used by kernel fusion optimization + std::vector 
fused_while_loops; + std::vector hybrid_gpu_stmts; + + // Used by blocking optimization + std::unordered_map graphs_with_blocking; + std::unordered_map graphs_with_transpose; }; } diff --git a/include/graphit/midend/mir_metadata.h b/include/graphit/midend/mir_metadata.h new file mode 100644 index 00000000..fed5eed7 --- /dev/null +++ b/include/graphit/midend/mir_metadata.h @@ -0,0 +1,53 @@ +#ifndef MIR_METADATA_H +#define MIR_METADATA_H + +#include +#include +namespace graphit { +namespace mir { + +template +class MIRMetadataImpl; + +// The abstract class for the mir metadata +// Different templated metadata types inherit from this type +class MIRMetadata: public std::enable_shared_from_this { +public: + typedef std::shared_ptr Ptr; + virtual ~MIRMetadata() = default; + + + template + bool isa (void) { + if(std::dynamic_pointer_cast>(shared_from_this())) + return true; + return false; + } + template + typename MIRMetadataImpl::Ptr to(void) { + typename MIRMetadataImpl::Ptr ret = std::dynamic_pointer_cast>(shared_from_this()); + assert(ret != nullptr); + return ret; + } + virtual MIRMetadata::Ptr clone(void) { + return shared_from_this(); + } +}; + +// Templated metadata class for each type +template +class MIRMetadataImpl: public MIRMetadata { +public: + typedef std::shared_ptr> Ptr; + T val; + MIRMetadataImpl(T _val): val(_val) { + } + MIRMetadata::Ptr clone(void) { + Ptr new_md = std::make_shared>(*this); + return new_md; + } +}; + +} +} +#endif diff --git a/include/graphit/midend/mir_rewriter.h b/include/graphit/midend/mir_rewriter.h index e8c28388..783a7b04 100644 --- a/include/graphit/midend/mir_rewriter.h +++ b/include/graphit/midend/mir_rewriter.h @@ -161,6 +161,11 @@ namespace graphit { virtual void visit(std::shared_ptr); virtual void visit(std::shared_ptr); + + // GPU Additions + virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); template std::shared_ptr rewrite(std::shared_ptr ptr) { diff --git a/include/graphit/midend/mir_visitor.h b/include/graphit/midend/mir_visitor.h index 1ef43fab..af9e0fe9 100644 --- a/include/graphit/midend/mir_visitor.h +++ b/include/graphit/midend/mir_visitor.h @@ -110,15 +110,20 @@ namespace graphit { struct UpdatePriorityEdgeSetApplyExpr; struct UpdatePriorityExternVertexSetApplyExpr; struct UpdatePriorityUpdateBucketsCall; - struct UpdatePriorityExternCall; + struct UpdatePriorityExternCall; - struct OrderedProcessingOperator; - - struct PriorityUpdateOperator; - struct PriorityUpdateOperatorMin; - struct PriorityUpdateOperatorSum; - struct UpdatePriorityEdgeCountEdgeSetApplyExpr; + struct OrderedProcessingOperator; + struct PriorityUpdateOperator; + struct PriorityUpdateOperatorMin; + struct PriorityUpdateOperatorSum; + struct UpdatePriorityEdgeCountEdgeSetApplyExpr; + + // GPU Additions + struct VertexSetDedupExpr; + struct HybridGPUStmt; + struct EnqueueVertex; + struct MIRVisitor { virtual void visit(Var*); @@ -264,17 +269,22 @@ namespace graphit { virtual void visit(std::shared_ptr); - virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); - virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); - virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); virtual void visit(std::shared_ptr); - + + // GPU Additions + virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); + + protected: - protected: std::shared_ptr node; LabelScope label_scope_; std::shared_ptr 
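The metadata mechanism added to MIRNode above (setMetadata/hasMetadata/getMetadata, backed by the templated MIRMetadataImpl in mir_metadata.h) attaches typed values to MIR nodes under string keys. A minimal usage sketch; the helper function, the node argument, and the key name "example_flag" are illustrative and not part of the diff:

#include <graphit/midend/mir.h>

static void tag_example(graphit::mir::MIRNode::Ptr node) {
    // Attach a typed value under a string key ("example_flag" is illustrative) ...
    node->setMetadata<bool>("example_flag", true);
    // ... and read it back only after checking that the key exists with the
    // expected type, since getMetadata asserts on a missing key.
    if (node->hasMetadata<bool>("example_flag")) {
        bool flag = node->getMetadata<bool>("example_flag");
        (void) flag;
    }
}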
enclosing_func_decl_ = nullptr; diff --git a/include/graphit/midend/priority_queue_frontier_reuse.h b/include/graphit/midend/priority_queue_frontier_reuse.h new file mode 100644 index 00000000..eed4771f --- /dev/null +++ b/include/graphit/midend/priority_queue_frontier_reuse.h @@ -0,0 +1,10 @@ +#ifndef PRIORITY_QUEUE_FRONTIER_REUSE_H +#define PRIORITY_QUEUE_FRONTIER_REUSE_H +#include +#include +#include namespace graphit { /** @@ -15,7 +16,7 @@ namespace graphit { */ class GlobalFieldVectorLower { public: - GlobalFieldVectorLower(MIRContext *mir_context) : mir_context_(mir_context){ + GlobalFieldVectorLower(MIRContext *mir_context, Schedule *schedule) : mir_context_(mir_context), schedule_(schedule) { } @@ -27,6 +28,7 @@ namespace graphit { private: MIRContext *mir_context_; + Schedule *schedule_ = nullptr; }; } diff --git a/include/graphit/midend/while_loop_fusion.h b/include/graphit/midend/while_loop_fusion.h new file mode 100644 index 00000000..27a46427 --- /dev/null +++ b/include/graphit/midend/while_loop_fusion.h @@ -0,0 +1,24 @@ +#ifndef WHILE_LOOP_FUSION_H +#define WHILE_LOOP_FUSION_H + +#include +#include +#include + +namespace graphit { + +struct WhileLoopFusion: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + WhileLoopFusion(MIRContext* mir_context, Schedule* schedule): mir_context_(mir_context), schedule_(schedule) { + } + void lower(void); +protected: + virtual void visit(mir::WhileStmt::Ptr); +private: + Schedule *schedule_ = nullptr; + MIRContext *mir_context_ = nullptr; +}; + +} + +#endif diff --git a/src/backend/backend.cpp b/src/backend/backend.cpp index 27b7b94b..1dff2ef5 100644 --- a/src/backend/backend.cpp +++ b/src/backend/backend.cpp @@ -17,4 +17,8 @@ namespace graphit{ delete codegen_python; return flag; } + int Backend::emitGPU(std::ostream &oss, std::string module_name, std::string module_path) { + CodeGenGPU code_gen_gpu(oss, mir_context_, module_name, module_path); + return code_gen_gpu.genGPU(); + } } diff --git a/src/backend/codegen_cpp.cpp b/src/backend/codegen_cpp.cpp index 8431c798..d00d175c 100644 --- a/src/backend/codegen_cpp.cpp +++ b/src/backend/codegen_cpp.cpp @@ -2045,7 +2045,7 @@ namespace graphit { oss << "new julienne::PriorityQueue <"; priority_queue_alloc_expr->priority_type->accept(this); oss << " > ( "; - + oss << mir_context_->getEdgeSets()[0]->name; if (priority_queue_alloc_expr->priority_update_type == mir::PriorityUpdateType::ReduceBeforePriorityUpdate){ diff --git a/src/backend/codegen_gpu/assign_function_context.cpp b/src/backend/codegen_gpu/assign_function_context.cpp new file mode 100644 index 00000000..88ee26bc --- /dev/null +++ b/src/backend/codegen_gpu/assign_function_context.cpp @@ -0,0 +1,45 @@ +#include "graphit/backend/codegen_gpu/assign_function_context.h" + + +namespace graphit { +int AssignFunctionContext::assign_function_context(void) { + const std::vector &functions = mir_context_->getFunctionList(); + for (auto it = functions.begin(); it != functions.end(); it++) + it->get()->accept(this); + for (auto stmt: mir_context_->field_vector_init_stmts) + stmt->accept(this); + +} +void AssignFunctionContext::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { + if (pesae->input_function && mir_context_->isFunction(pesae->input_function->function_name->name)) + mir_context_->getFunction(pesae->input_function->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (pesae->from_func && mir_context_->isFunction(pesae->from_func->function_name->name)) + 
mir_context_->getFunction(pesae->from_func->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (pesae->to_func && mir_context_->isFunction(pesae->to_func->function_name->name)) + mir_context_->getFunction(pesae->to_func->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} +void AssignFunctionContext::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr pesae) { + if (pesae->input_function && mir_context_->isFunction(pesae->input_function->function_name->name)) + mir_context_->getFunction(pesae->input_function->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (pesae->from_func && mir_context_->isFunction(pesae->from_func->function_name->name)) + mir_context_->getFunction(pesae->from_func->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (pesae->to_func && mir_context_->isFunction(pesae->to_func->function_name->name)) + mir_context_->getFunction(pesae->to_func->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} +void AssignFunctionContext::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { + if (pesae->input_function && mir_context_->isFunction(pesae->input_function->function_name->name)) + mir_context_->getFunction(pesae->input_function->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (pesae->from_func && mir_context_->isFunction(pesae->from_func->function_name->name)) + mir_context_->getFunction(pesae->from_func->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (pesae->to_func && mir_context_->isFunction(pesae->to_func->function_name->name)) + mir_context_->getFunction(pesae->to_func->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} +void AssignFunctionContext::visit(mir::VertexSetApplyExpr::Ptr vsae) { + if (vsae->input_function && mir_context_->isFunction(vsae->input_function->function_name->name)) + mir_context_->getFunction(vsae->input_function->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} +void AssignFunctionContext::visit(mir::VertexSetWhereExpr::Ptr vswe) { + if (vswe->input_func && mir_context_->isFunction(vswe->input_func->function_name->name)) + mir_context_->getFunction(vswe->input_func->function_name->name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} +} diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp new file mode 100644 index 00000000..8a3ebc6a --- /dev/null +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -0,0 +1,1795 @@ +// +// Created by Ajay Brahmakshatriya on 9/7/2019 +// + +#include +#include +#include "graphit/backend/codegen_gpu/extract_read_write_set.h" +#include +#include +#include + +namespace graphit { +int CodeGenGPU::genGPU() { + AssignFunctionContext assign_function_context(mir_context_); + assign_function_context.assign_function_context(); + + + CodeGenGPUHost code_gen_gpu_host(oss, mir_context_, module_name, ""); + + genIncludeStmts(); + + genGlobalDeclarations(); + + // This generates all the declarations of type GraphT<...> + genEdgeSets(); + + // Declare all the vertex properties + // We are only declaring the device versions now. 
If required we can generate the host versions later + for (auto constant: mir_context_->getLoweredConstants()) { + if ((mir::isa(constant->type))) { + // This is some vertex data + genPropertyArrayDecl(constant); + } else { + // This is some scalar variable w or w/o initialization + genScalarDecl(constant); + } + } + + std::vector functions = mir_context_->getFunctionList(); + // Before we generate any functions or kernels, we generate the function declarations + for (auto function: functions) { + if (function->name != "main") + genFuncDecl(function); + } + + // Every operator requires a kernel to be generated + // Create that first because all the actual functions will be calling these kernels + CodeGenGPUKernelEmitter kernel_emitter(oss, mir_context_); + for (auto function: functions) + function->accept(&kernel_emitter); + + // All the fused kernels need to generated before we can acutally generate the functions + for (auto while_loop: mir_context_->fused_while_loops) + genFusedWhileLoop(while_loop); + + for (auto function: functions) { + if (function->function_context & mir::FuncDecl::function_context_type::CONTEXT_DEVICE) + function->accept(this); + if (function->function_context & mir::FuncDecl::function_context_type::CONTEXT_HOST) + function->accept(&code_gen_gpu_host); + } + + oss << std::endl; + return 0; +} + +void CodeGenGPU::genScalarDecl(mir::VarDecl::Ptr var_decl) { + var_decl->type->accept(this); + oss << " __device__ " << var_decl->name << "; " << std::endl; + + var_decl->type->accept(this); + oss << " __host_" << var_decl->name << ";" << std::endl; + + if (mir::isa(var_decl->type)) { + var_decl->type->accept(this); + oss << " *__device_" << var_decl->name << ";" << std::endl; + } +} +void CodeGenGPU::genPropertyArrayDecl(mir::VarDecl::Ptr constant) { + mir::VectorType::Ptr vector_type = mir::to(constant->type); + vector_type->vector_element_type->accept(this); + oss << " __device__ *" << constant->name << ";" << std::endl; + + // Also generate the host versions of these arrays + vector_type->vector_element_type->accept(this); + oss << " " << "*__host_" << constant->name << ";" << std::endl; + // Also generate the device pointer for easy copy + vector_type->vector_element_type->accept(this); + oss << " " << "*__device_" << constant->name << ";" << std::endl; +} + +void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { + auto vector_type = mir::to(var_decl->type); + assert(vector_type != nullptr); + + mir::Expr::Ptr size_expr = nullptr; + if (vector_type->element_type != nullptr) { + size_expr = mir_context_->getElementCount(vector_type->element_type); + assert(size_expr != nullptr); + } + + + if (var_decl->initVal != nullptr && mir::isa(var_decl->initVal)) { + printIndent(); + oss << "__device_" << var_decl->name << " = "; + var_decl->initVal->accept(this); + oss << ";" << std::endl; + } else { + printIndent(); + oss << "cudaMalloc(&__device_" << var_decl->name << ", "; + if (size_expr != nullptr) + size_expr->accept(this); + else + oss << vector_type->range_indexset; + oss << " * sizeof("; + vector_type->vector_element_type->accept(this); + oss << "));" << std::endl; + } + + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_decl->name; + oss << ", &__device_" << var_decl->name << ", sizeof("; + vector_type->vector_element_type->accept(this); + oss << "*), 0);" << std::endl; + + printIndent(); + oss << "__host_" << var_decl->name << " = new "; + vector_type->vector_element_type->accept(this); + oss << "["; + if (size_expr != nullptr) + 
size_expr->accept(this); + else + oss << vector_type->range_indexset; + oss << "];" << std::endl; + + +} +void KernelVariableExtractor::visit(mir::VarExpr::Ptr var_expr) { + if (mir_context_->isLoweredConst(var_expr->var.getName())) { + return; + } + + insertVar(var_expr->var); +} +void KernelVariableExtractor::visit(mir::VarDecl::Ptr var_decl) { + insertDecl(var_decl); +} +void KernelVariableExtractor::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr esae) { + mir::MIRVisitor::visit(esae); + hoisted_pqs.push_back(esae->priority_queue_used); +} +void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { + + // First we generate a unique function name for this fused kernel + std::string fused_kernel_name = "fused_kernel_body_" + mir_context_->getUniqueNameCounterString(); + while_stmt->fused_kernel_name = fused_kernel_name; + + // Now we extract the list of variables that are used in the kernel that are not const + // So we can hoist them + KernelVariableExtractor extractor(mir_context_); + while_stmt->accept(&extractor); + + while_stmt->hoisted_vars = extractor.hoisted_vars; + while_stmt->hoisted_decls = extractor.hoisted_decls; + + CodeGenGPUFusedKernel codegen (oss, mir_context_, module_name, ""); + codegen.current_while_stmt = while_stmt; + + oss << "// "; + for (auto var: extractor.hoisted_vars) + oss << var.getName() << " "; + oss << std::endl; + + for (auto var: extractor.hoisted_vars) { + var.getType()->accept(this); + oss << " __device__ " << fused_kernel_name << "_" << var.getName() << ";" << std::endl; + } + codegen.kernel_hoisted_vars = extractor.hoisted_vars; + codegen.current_kernel_name = fused_kernel_name; + for (auto var: extractor.hoisted_pqs) + codegen.kernel_hoisted_vars.push_back(var); + + oss << "void __global__ " << fused_kernel_name << "(void) {" << std::endl; + codegen.indent(); + codegen.printIndent(); + oss << "grid_group _grid = this_grid();" << std::endl; + codegen.printIndent(); + oss << "int32_t _thread_id = threadIdx.x + blockIdx.x * blockDim.x;" << std::endl; + // For all the variables we would also generate local copies in each thread + for (auto var: extractor.hoisted_vars) { + codegen.printIndent(); + oss << "auto __local_" << var.getName() << " = " << fused_kernel_name << "_" << var.getName() << ";" << std::endl; + } + for (auto var: extractor.hoisted_pqs) { + codegen.printIndent(); + oss << "auto __local_" << var.getName() << " = " << var.getName() << ";" << std::endl; + } + + codegen.printIndent(); + oss << "while ("; + while_stmt->cond->accept(&codegen); + oss << ") {" << std::endl; + codegen.indent(); + while_stmt->body->accept(&codegen); + codegen.dedent(); + codegen.printIndent(); + oss << "}" << std::endl; + + // After the kernel has ended, we should copy back all the variables + codegen.printIndent(); + oss << "if (_thread_id == 0) {" << std::endl; + codegen.indent(); + for (auto var: extractor.hoisted_vars) { + codegen.printIndent(); + oss << fused_kernel_name << "_" << var.getName() << " = " << "__local_" << var.getName() << ";" << std::endl; + } + for (auto var: extractor.hoisted_pqs) { + codegen.printIndent(); + oss << var.getName() << " = __local_" << var.getName() << ";" << std::endl; + } + codegen.dedent(); + codegen.printIndent(); + oss << "}" << std::endl; + codegen.dedent(); + codegen.printIndent(); + oss << "}" << std::endl; + + codegen.kernel_hoisted_vars.clear(); +} +void CodeGenGPUFusedKernel::visit(mir::StmtBlock::Ptr stmt_block) { + for (auto stmt : *(stmt_block->stmts)) { + stmt->accept(this); + } +} +void 
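genFusedWhileLoop above emits one cooperative __global__ kernel per fused while loop: every hoisted variable gets a per-kernel __device__ global, each thread takes a local copy on entry, and thread 0 writes the values back after the loop exits. Roughly the shape of the generated code for a loop with a single hoisted int32_t counter, assuming the cooperative_groups include and using-directive emitted by genIncludeStmts; the kernel name and the loop condition are illustrative:

int32_t __device__ fused_kernel_body_0_iter;   // hoisted variable (name illustrative)

void __global__ fused_kernel_body_0(void) {
    grid_group _grid = this_grid();
    int32_t _thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    // Per-thread local copy of the hoisted variable
    auto __local_iter = fused_kernel_body_0_iter;
    while (__local_iter < 10) {                // condition is illustrative
        // ... fused loop body emitted by CodeGenGPUFusedKernel ...
        __local_iter = __local_iter + 1;
    }
    // Thread 0 copies the hoisted state back once the loop terminates
    if (_thread_id == 0) {
        fused_kernel_body_0_iter = __local_iter;
    }
}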
CodeGenGPU::genFuncDecl(mir::FuncDecl::Ptr func_decl) { + if (func_decl->result.isInitialized()) { + func_decl->result.getType()->accept(this); + } else { + oss << "void"; + } + + if (func_decl->function_context & mir::FuncDecl::function_context_type::CONTEXT_DEVICE) + oss << " " << "__device__" << " " << func_decl->name << "("; + else + oss << " " << func_decl->name << "("; + + bool printDelimeter = false; + for (auto arg: func_decl->args) { + if (printDelimeter) + oss << ", "; + arg.getType()->accept(this); + oss << " " << arg.getName(); + printDelimeter = true; + } + oss << ");" << std::endl; +} +void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { + + // First we generate the function that is passed to the load balancing function + + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); + printIndent(); + oss << "// Body of the actual operator code" << std::endl; + if (apply_expr->to_func && apply_expr->to_func->function_name->name != "") { + printIndent(); + oss << "if (!" << apply_expr->to_func->function_name->name << "(dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function->function_name->name); + // Enqueueing is disabled from here. We are now enqueing from the UDF + if (apply_expr->is_weighted) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function->function_name->name << "(src, dst, weight"; + } else { + printIndent(); + oss << apply_expr->input_function->function_name->name << "(src, dst"; + } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + apply_expr->device_function = load_balancing_arg; + +} + +void CodeGenGPUKernelEmitter::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr apply_expr) { + + + + if (apply_expr->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) { + // First we generate the function that is passed to the load balancing function + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); + printIndent(); + oss << "// Body of the actual operator code" << std::endl; + if (apply_expr->to_func && apply_expr->to_func->function_name->name != "") { + printIndent(); + oss << "if (!" << apply_expr->to_func->function_name->name << "(dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function->function_name->name); + // Enqueueing is disabled from here. 
We are now enqueing from the UDF + if (apply_expr->is_weighted) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function->function_name->name << "(src, dst, weight"; + } else { + printIndent(); + oss << apply_expr->input_function->function_name->name << "(src, dst"; + } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + apply_expr->device_function = load_balancing_arg; + } else if (apply_expr->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + // First we generate the function that is passed to the load balancing function + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); + printIndent(); + oss << "// Body of the actual operator" << std::endl; + // Before we generate the call to the UDF, we have to check if the dst is on the input frontier + if (apply_expr->from_func && apply_expr->from_func->function_name->name != "") { + if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "if (!input_frontier.d_byte_map_input[dst])" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } else if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "if (!gpu_runtime::checkBit(input_frontier.d_bit_map_input, dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } + } + + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function->function_name->name); + // Enqueueing is disabled from here. 
We are now enqueing from the UDF + if (apply_expr->is_weighted) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function->function_name->name << "(dst, src, weight"; + } else { + printIndent(); + oss << apply_expr->input_function->function_name->name << "(dst, src"; + } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + apply_expr->device_function = load_balancing_arg; + } +} + +void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { + + // First we generate the function that is passed to the load balancing function + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); + printIndent(); + oss << "// Body of the actual operator" << std::endl; + // Before we generate the call to the UDF, we have to check if the dst is on the input frontier + if (apply_expr->from_func && apply_expr->from_func->function_name->name != "") { + if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "if (!input_frontier.d_byte_map_input[dst])" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } else if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "if (!gpu_runtime::checkBit(input_frontier.d_bit_map_input, dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } + } + + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function->function_name->name); + // Enqueueing is disabled from here. 
We are now enqueing from the UDF + if (apply_expr->is_weighted) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function->function_name->name << "(dst, src, weight"; + } else { + printIndent(); + oss << apply_expr->input_function->function_name->name << "(dst, src"; + } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + apply_expr->device_function = load_balancing_arg; + +} + +void CodeGenGPU::genIncludeStmts(void) { + oss << "#include \"gpu_intrinsics.h\"" << std::endl; + oss << "#include " << std::endl; + oss << "using namespace cooperative_groups;" << std::endl; +} + +void CodeGenGPU::genGlobalDeclarations(void) { + for (auto stmt: mir_context_->hybrid_gpu_stmts) { + std::string threshold_var_name = "hybrid_threshold_var" + mir_context_->getUniqueNameCounterString(); + oss << "float " << threshold_var_name << ";" << std::endl; + oss << "float __device__ __device_" << threshold_var_name << ";" << std::endl; + stmt->threshold_var_name = threshold_var_name; + } + oss << "int32_t __delta_param;" << std::endl; +} + +void CodeGenGPU::genEdgeSets(void) { + for (auto edgeset: mir_context_->getEdgeSets()) { + auto edge_set_type = mir::to(edgeset->type); + edge_set_type->accept(this); + oss << " __device__ " << edgeset->name << ";" << std::endl; + edge_set_type->accept(this); + oss << " " << "__host_" << edgeset->name << ";" << std::endl; + + bool requires_transpose = false; + bool requires_blocking = false; + uint32_t blocking_size = 0; + if (mir_context_->graphs_with_blocking.find(edgeset->name) != mir_context_->graphs_with_blocking.end()) { + blocking_size = mir_context_->graphs_with_blocking[edgeset->name]; + auto edge_set_type = mir::to(edgeset->type); + edge_set_type->accept(this); + oss << " __device__ " << edgeset->name << "__blocked_" << blocking_size << ";" << std::endl; + edge_set_type->accept(this); + oss << " " << "__host_" << edgeset->name << "__blocked_" << blocking_size << ";" << std::endl; + requires_blocking = true; + } + + if (mir_context_->graphs_with_transpose.find(edgeset->name) != mir_context_->graphs_with_transpose.end() && mir_context_->graphs_with_transpose[edgeset->name]) { + auto edge_set_type = mir::to(edgeset->type); + edge_set_type->accept(this); + oss << " __device__ " << edgeset->name << "__transposed" << ";" << std::endl; + edge_set_type->accept(this); + oss << " __host_" << edgeset->name << "__transposed" << ";" << std::endl; + requires_transpose = true; + + } + if (requires_transpose && requires_blocking) { + auto edge_set_type = mir::to(edgeset->type); + edge_set_type->accept(this); + oss << " __device__ " << edgeset->name << "__blocked_" << blocking_size << "__transposed" << ";" << std::endl; + edge_set_type->accept(this); + oss << " __host_" << edgeset->name << "__blocked_" << blocking_size << "__transposed" << ";" << std::endl; + } + + + } +} + +void CodeGenGPU::visit(mir::EdgeSetType::Ptr edgeset_type) { + if (edgeset_type->weight_type != nullptr) { + oss << "gpu_runtime::GraphT<"; + edgeset_type->weight_type->accept(this); + oss << ">"; + } else { + oss << "gpu_runtime::GraphT"; + } +} + +void CodeGenGPU::visit(mir::PriorityQueueType::Ptr pqt) { + oss << "gpu_runtime::GPUPriorityQueue<"; + pqt->priority_type->accept(this); + oss << ">"; +} + +void CodeGenGPU::visit(mir::VertexSetType::Ptr vertexset_type) { + oss << "gpu_runtime::VertexFrontier"; +} +void 
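For reference, the push-direction operator body emitted by CodeGenGPUKernelEmitter above takes roughly the following form for an unweighted apply whose UDF is updateEdge and whose to-filter is toFilter. The function name, the UDF names, and the spelled-out template parameter list are assumptions for illustration only:

// Function name, UDF names, and template parameter are assumed for illustration.
template <typename EdgeWeightType>
void __device__ gpu_operator_body_2(gpu_runtime::GraphT<EdgeWeightType> graph,
        int32_t src, int32_t dst, int32_t edge_id,
        gpu_runtime::VertexFrontier input_frontier,
        gpu_runtime::VertexFrontier output_frontier) {
    // Body of the actual operator code
    if (!toFilter(dst))
        return;
    // The UDF itself enqueues into output_frontier when an output frontier is
    // required, so no enqueue is emitted here.
    updateEdge(src, dst, output_frontier);
}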
CodeGenGPU::visit(mir::ListType::Ptr list_type) { + if (mir::isa(list_type->element_type)) { + oss << "gpu_runtime::VertexFrontierList"; + return; + } + oss << "std::vector<"; + list_type->element_type->accept(this); + oss << ">"; +} +void CodeGenGPU::visit(mir::ListAllocExpr::Ptr alloc_expr) { + if (mir::isa(alloc_expr->element_type)) { + oss << "gpu_runtime::create_new_vertex_frontier_list("; + mir::VertexSetType::Ptr vst = mir::to(alloc_expr->element_type); + mir::Expr::Ptr size_expr = mir_context_->getElementCount(vst->element); + size_expr->accept(this); + oss << ")"; + return; + } + oss << "std::vector<"; + alloc_expr->element_type->accept(this); + oss << ">()"; +} +void CodeGenGPU::visit(mir::ScalarType::Ptr scalar_type) { + switch(scalar_type->type) { + case mir::ScalarType::Type::INT: + oss << "int32_t"; + break; + case mir::ScalarType::Type::UINT: + oss << "uint32_t"; + break; + case mir::ScalarType::Type::FLOAT: + oss << "float"; + break; + case mir::ScalarType::Type::DOUBLE: + oss << "double"; + break; + case mir::ScalarType::Type::BOOL: + oss << "bool"; + break; + case mir::ScalarType::Type::COMPLEX: + assert(false && "Complex type not yet supported with the GPU backend\n"); + break; + case mir::ScalarType::Type::STRING: + assert(false && "String type not yet supported with the GPU backend\n"); + break; + default: + assert(false && "Invalid type enum for scalar type\n"); + break; + } +} + +void CodeGenGPU::genHybridThresholds(void) { + for (auto stmt: mir_context_->hybrid_gpu_stmts) { + std::string var_name = stmt->threshold_var_name; + if (stmt->threshold < 0) { + printIndent(); + oss << stmt->threshold_var_name << " = gpu_runtime::str_to_float(argv[" << stmt->argv_index << "]);" << std::endl; + } else { + printIndent(); + oss << stmt->threshold_var_name << " = " << stmt->threshold << ";" << std::endl; + } + printIndent(); + oss << "cudaMemcpyToSymbol(__device_" << stmt->threshold_var_name << ", &" << stmt->threshold_var_name << ", sizeof(float), 0);" << std::endl; + } +} +void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { + if (func_decl->type == mir::FuncDecl::Type::EXTERNAL) { + assert(false && "GPU backend currently doesn't support external functions\n"); + } else { + // First generate the signature of the function + if (func_decl->name == "main") { + oss << "int " << getBackendFunctionLabel() << " main(int argc, char* argv[])"; + } else { + if (func_decl->result.isInitialized()) { + func_decl->result.getType()->accept(this); + } else { + oss << "void"; + } + oss << " " << getBackendFunctionLabel() << " " << func_decl->name << "("; + bool printDelimeter = false; + for (auto arg: func_decl->args) { + if (printDelimeter) + oss << ", "; + arg.getType()->accept(this); + oss << " " << arg.getName(); + printDelimeter = true; + } + oss << ")"; + } + oss << " {" << std::endl; + indent(); + + if (func_decl->name == "main") { + genHybridThresholds(); + if (mir_context_->delta_ <= 0) { + printIndent(); + oss << "__delta_param = gpu_runtime::str_to_int(argv[" << - mir_context_->delta_ << "]);" << std::endl; + } else { + printIndent(); + oss << "__delta_param = " << mir_context_->delta_ << ";" << std::endl; + } + for (auto stmt: mir_context_->edgeset_alloc_stmts) { + mir::AssignStmt::Ptr assign_stmt = mir::to(stmt); + mir::EdgeSetLoadExpr::Ptr edge_set_load_expr = mir::to(assign_stmt->expr); + mir::VarExpr::Ptr lhs_var = mir::to(assign_stmt->lhs); + std::string var_name = lhs_var->var.getName(); + + printIndent(); + oss << "gpu_runtime::load_graph("; + oss << "__host_" << var_name 
<< ", "; + edge_set_load_expr->file_name->accept(this); + oss << ", false);" << std::endl; + + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_name << ", &__host_" << var_name << ", sizeof(__host_" << var_name << "), 0, cudaMemcpyHostToDevice);" << std::endl; + bool requires_blocking = false; + bool requires_transpose = false; + uint32_t blocking_size = 0; + if (mir_context_->graphs_with_blocking.find(var_name) != mir_context_->graphs_with_blocking.end()) { + blocking_size = mir_context_->graphs_with_blocking[var_name]; + requires_blocking = true; + printIndent(); + oss << "gpu_runtime::block_graph_edges(__host_" << var_name << ", __host_" << var_name << "__blocked_" << blocking_size << ", " << blocking_size << ");" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_name << "__blocked_" << blocking_size << ", &__host_" << var_name << "__blocked_" << blocking_size << ", sizeof(__host_" << var_name << "__blocked_" << blocking_size << "), 0, cudaMemcpyHostToDevice);" << std::endl; + } + + if (mir_context_->graphs_with_transpose.find(var_name) != mir_context_->graphs_with_transpose.end() && mir_context_->graphs_with_transpose[var_name]) { + requires_transpose = true; + printIndent(); + oss << "__host_" << var_name << "__transposed = gpu_runtime::builtin_transpose(__host_" << var_name << ");" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_name << "__transposed" << ", &__host_" << var_name << "__transposed, sizeof(__host_" << var_name << "__transposed), 0, cudaMemcpyHostToDevice);" << std::endl; + } + if (requires_transpose && requires_blocking) { + printIndent(); + oss << "gpu_runtime::block_graph_edges(__host_" << var_name << "__transposed, __host_" << var_name << "__blocked_" << blocking_size << "__transposed, " << blocking_size << ");" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_name << "__blocked_" << blocking_size << "__transposed, &__host_" << var_name << "__blocked_" << blocking_size << "__transposed, sizeof(__host_" << var_name << "__blocked_" << blocking_size << "__transposed), 0, cudaMemcpyHostToDevice);" << std::endl; + + } + + + } + for (auto constant: mir_context_->getLoweredConstants()) { + if (mir::isa(constant->type)) { + if (constant->needs_allocation) + genPropertyArrayAlloca(constant); + } else { + if (constant->initVal != nullptr) { + printIndent(); + oss << "__host_" << constant->name << " = "; + constant->initVal->accept(this); + oss << ";" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol(" << constant->name << ", &__host_" << constant->name << ", sizeof("; + constant->type->accept(this); + oss << "), 0, cudaMemcpyHostToDevice);" << std::endl; + } + } + if (mir::isa(constant->type)) { + printIndent(); + oss << "cudaGetSymbolAddress(((void**)&__device_" << constant->name << "), " << constant->name << ");" << std::endl; + } + } + for (auto stmt: mir_context_->field_vector_init_stmts) { + stmt->accept(this); + } + } + if (func_decl->body && func_decl->body->stmts) { + if (func_decl->result.isInitialized()) { + printIndent(); + func_decl->result.getType()->accept(this); + oss << " " << func_decl->result.getName() << ";" << std::endl; + } + func_decl->body->accept(this); + if (func_decl->result.isInitialized()) { + printIndent(); + oss << "return " << func_decl->result.getName() << ";" << std::endl; + } + } + + dedent(); + printIndent(); + oss << "}" << std::endl; + } +} +void CodeGenGPU::visit(mir::ElementType::Ptr element_type) { + oss << "int32_t"; +} +void 
CodeGenGPU::genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr puo) { + printIndent(); + oss << "if ("; + if (mir::isa(puo)) { + mir::PriorityUpdateOperatorMin::Ptr puom = mir::to(puo); + if (puom->is_atomic) { + oss << "gpu_runtime::writeMin"; + } else { + assert(false && "Currently only atomic priority update is supported"); + } + oss << "("; + oss << "&("; + //puom->priority_queue->accept(this); + oss << "__output_frontier.d_priority_array["; + puom->destination_node_id->accept(this); + oss << "]), "; + puom->new_val->accept(this); + oss << ")"; + } + oss << " && "; + oss << "__output_frontier.d_priority_array["; + puo->destination_node_id->accept(this); + oss << "] < ("; + //puo->priority_queue->accept(this); + oss << "__output_frontier.priority_cutoff)"; + oss << ") {" << std::endl; + indent(); + + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = puo->edgeset_apply_expr; + mir::EnqueueVertex::Ptr evp = std::make_shared(); + evp->vertex_id = puo->destination_node_id; + mir::VarExpr::Ptr var_expr = mir::to(puo->priority_queue); + // Since this variable is created temporarily, we don;t need type + mir::Var var("__output_frontier", nullptr); + mir::VarExpr::Ptr frontier_expr = std::make_shared(); + frontier_expr->var = var; + + evp->vertex_frontier = frontier_expr; + if (upesae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + evp->type = mir::EnqueueVertex::Type::SPARSE; + } else if (upesae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + evp->type = mir::EnqueueVertex::Type::BOOLMAP; + } else if (upesae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + evp->type = mir::EnqueueVertex::Type::BITMAP; + } + + evp->accept(this); + dedent(); + printIndent(); + oss << "}" << std::endl; + +} +void CodeGenGPU::visit(mir::ExprStmt::Ptr expr_stmt) { + if (mir::isa(expr_stmt->expr)) { + genEdgeSetApplyExpr(mir::to(expr_stmt->expr), nullptr); + } else if (mir::isa(expr_stmt->expr)) { + genPriorityUpdateOperator(mir::to(expr_stmt->expr)); + } else { + printIndent(); + expr_stmt->expr->accept(this); + oss << ";" << std::endl; + } +} + +void CodeGenGPU::visit(mir::VarExpr::Ptr var_expr) { + if (is_hoisted_var(var_expr->var)) { + oss << "__local_" << var_expr->var.getName(); + return; + } else + oss << var_expr->var.getName(); +} +void CodeGenGPUHost::visit(mir::VarExpr::Ptr var_expr) { + if (mir_context_->isLoweredConst(var_expr->var.getName())) { + oss << "__host_" << var_expr->var.getName(); + return; + } else + oss << var_expr->var.getName(); + +} +void CodeGenGPUFusedKernel::visit(mir::VarExpr::Ptr var_expr) { + if (is_hoisted_var(var_expr->var)) { + oss << "__local_" << var_expr->var.getName(); + return; + } else + oss << var_expr->var.getName(); +} +void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr::Ptr target) { + if (target != nullptr && (esae->from_func == nullptr || esae->from_func->function_name->name == "")) { + assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); + } + // We will assume that the output frontier can reuse the input frontier. + // Assert that the frontier can be reused + /* + if (target != nullptr && esae->frontier_reusable != true) { + assert(false && "GPU backend currently doesn't support creating frontiers from the apply expressions. 
Could not find opportunity for reuse\n"); + } + */ + + printIndent(); + oss << "{" << std::endl; + indent(); + + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY) { + load_balance_function = "gpu_runtime::edge_only_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWC) { + load_balance_function = "gpu_runtime::TWC_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::CM) { + load_balance_function = "gpu_runtime::CM_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::WM) { + load_balance_function = "gpu_runtime::WM_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::STRICT) { + load_balance_function = "gpu_runtime::strict_load_balance"; + } + + if (mir::isa(esae) || mir::isa(esae) && esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) { + if (esae->from_func && esae->from_func->function_name->name != "") { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_sparse("; + oss << esae->from_func->function_name->name; + oss << ");" << std::endl; + } + } else if (mir::isa(esae) || mir::isa(esae) && esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + if (esae->from_func && esae->from_func->function_name->name != "") { + if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_boolmap("; + oss << esae->from_func->function_name->name; + oss << ");" << std::endl; + } else if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_bitmap("; + oss << esae->from_func->function_name->name; + oss << ");" << std::endl; + } + } + + std::string to_func ; + if (esae->to_func) + to_func = esae->to_func->function_name->name; + else + to_func = ""; + if (to_func != "") { + printIndent(); + oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue_host<" << to_func << ">("; + oss << esae->from_func->function_name->name << ");" << std::endl; + } + + } + + // We will have to create a new frontier in case the frontier cannot be reused + // If the frontier is reusable, we simply assign the old to the new + if (target != nullptr) { + if (esae->frontier_reusable) { + printIndent(); + target->accept(this); + oss << " = " << esae->from_func->function_name->name << ";" << std::endl; + } else { + printIndent(); + target->accept(this); + oss << " = "; + oss << "gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices("; + esae->target->accept(this); + oss << "), 0);" << std::endl; + } + } + if (mir::isa(esae)) { + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(esae); + printIndent(); + oss << "cudaMemcpyToSymbol(" << upesae->priority_queue_used.getName() << ", &__host_" << upesae->priority_queue_used.getName() << ", sizeof(" << 
upesae->priority_queue_used.getName() << "), 0);" << std::endl; + } + + // Before the load balance if the update requires dedup, then update the counters + if (esae->fused_dedup && target != nullptr) { + printIndent(); + target->accept(this); + oss << ".curr_dedup_counter++;" << std::endl; + } + printIndent(); + oss << load_balance_function << "_host<"; + + mir::Var target_var = mir::to(esae->target)->var; + mir::EdgeSetType::Ptr target_type = mir::to(target_var.getType()); + if (target_type->weight_type == nullptr) + oss << "char"; + else + target_type->weight_type->accept(this); + + std::string accessor_type = "gpu_runtime::AccessorSparse"; + if (!esae->from_func || esae->from_func->function_name->name == "") + accessor_type = "gpu_runtime::AccessorAll"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && (esae->from_func == nullptr || esae->to_func->function_name->name == "")) + accessor_type = "gpu_runtime::AccessorAll"; + std::string src_filter = "gpu_runtime::true_function"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->from_func && esae->to_func->function_name->name != "") + src_filter = esae->to_func->function_name->name; + + oss << ", " << esae->device_function << ", " << accessor_type << ", " << src_filter << ">("; + esae->target->accept(this); + if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY && esae->applied_schedule.edge_blocking == fir::gpu_schedule::SimpleGPUSchedule::edge_blocking_type::BLOCKED) { + oss << "__blocked_" << esae->applied_schedule.edge_blocking_size; + } + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + oss << "__transposed"; + } + oss << ", "; + if (esae->from_func && esae->from_func->function_name->name != "") + oss << esae->from_func->function_name->name; + else { + esae->target->accept(this); + oss << ".getFullFrontier()"; + } + oss << ", "; + if (target != nullptr) + target->accept(this); + else + oss << "gpu_runtime::sentinel_frontier"; + oss << ");" << std::endl; + + + printIndent(); + oss << "cudaDeviceSynchronize();" << std::endl; + if (target != nullptr) { + if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + printIndent(); + oss << "gpu_runtime::swap_queues("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + printIndent(); + oss << "gpu_runtime::swap_bitmaps("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::BITMAP;" << std::endl; + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + printIndent(); + oss << "gpu_runtime::swap_bytemaps("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::BYTEMAP;" << std::endl; + } + } + dedent(); + printIndent(); + oss << "}" << std::endl; + +} +void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr::Ptr target) { + if 
(target != nullptr && (esae->from_func == nullptr || esae->from_func->function_name->name == "")) { + assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); + } + printIndent(); + oss << "{" << std::endl; + indent(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY) { + load_balance_function = "gpu_runtime::edge_only_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWC) { + load_balance_function = "gpu_runtime::TWC_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::CM) { + load_balance_function = "gpu_runtime::CM_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::WM) { + load_balance_function = "gpu_runtime::WM_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::STRICT) { + load_balance_function = "gpu_runtime::strict_load_balance"; + } + + if (mir::isa(esae) || mir::isa(esae) && esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_sparse_device("; + oss << var_name(esae->from_func->function_name->name); + oss << ");" << std::endl; + } else if (mir::isa(esae) || mir::isa(esae) && esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_boolmap_device("; + oss << var_name(esae->from_func->function_name->name); + oss << ");" << std::endl; + } else if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_bitmap_device("; + oss << var_name(esae->from_func->function_name->name); + oss << ");" << std::endl; + } + std::string to_func; + if (esae->to_func) + to_func = esae->to_func->function_name->name; + else + to_func = ""; + + if (to_func != "") { + printIndent(); + oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue_device<" << to_func << ">("; + oss << var_name(esae->from_func->function_name->name) << ");" << std::endl; + } + } + if (target != nullptr) { + printIndent(); + target->accept(this); + oss << " = " << var_name(esae->from_func->function_name->name) << ";" << std::endl; + } + if (mir::isa(esae)) { + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(esae); + insertUsedPq(upesae->priority_queue_used); + } + if (mir::isa(esae)) { +/* + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(esae); + printIndent(); + oss << "if (_thread_id == 0) {" << std::endl; + indent(); + printIndent(); + oss << upesae->priority_queue_used.getName() << " = __local_" << upesae->priority_queue_used.getName() << ";" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + printIndent(); + oss << "_grid.sync();" << std::endl; + //oss << 
"cudaMemcpyToSymbol(" << upesae->priority_queue_used.getName() << ", &__host_" << upesae->priority_queue_used.getName() << ", sizeof(" << upesae->priority_queue_used.getName() << "), 0);" << std::endl; +*/ + } + // Before the load balance if the update requires dedup, then update the counters + if (esae->fused_dedup && target != nullptr) { + printIndent(); + target->accept(this); + oss << ".curr_dedup_counter++;" << std::endl; + } + printIndent(); + oss << load_balance_function << "_device<"; + + mir::Var target_var = mir::to(esae->target)->var; + mir::EdgeSetType::Ptr target_type = mir::to(target_var.getType()); + if (target_type->weight_type == nullptr) + oss << "char"; + else + target_type->weight_type->accept(this); + + std::string accessor_type = "gpu_runtime::AccessorSparse"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && (esae->to_func == nullptr || esae->to_func->function_name->name == "")) + accessor_type = "gpu_runtime::AccessorAll"; + std::string src_filter = "gpu_runtime::true_function"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func && esae->to_func->function_name->name != "") + src_filter = esae->to_func->function_name->name; + + oss << ", " << esae->device_function << ", " << accessor_type << ", " << src_filter << ">("; + esae->target->accept(this); + oss << ", " << var_name(esae->from_func->function_name->name) << ", "; + if (target != nullptr) + target->accept(this); + else + oss << "gpu_runtime::device_sentinel_frontier"; + oss << ");" << std::endl; + + if (target != nullptr) { + mir::VarExpr::Ptr target_expr = mir::to(target); + if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + printIndent(); + oss << "gpu_runtime::swap_queues_device("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + printIndent(); + oss << "gpu_runtime::swap_bitmaps_device("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::BITMAP;" << std::endl; + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + printIndent(); + oss << "gpu_runtime::swap_bytemaps_device("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::BYTEMAP;" << std::endl; + } + } + dedent(); + printIndent(); + oss << "}" << std::endl; + +} +void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { + if (mir::isa(assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); + genEdgeSetApplyExpr(esae, assign_stmt->lhs); + } else if (mir::isa(assign_stmt->expr)) { + mir::PriorityQueueAllocExpr::Ptr pqae = mir::to(assign_stmt->expr); + printIndent(); + assign_stmt->lhs->accept(this); + oss << ".init("; + std::string graph_name = mir_context_->getEdgeSets()[0]->name; + oss << "__host_" << graph_name << ", "; + std::string vector_name = pqae->vector_function; + if (mir_context_->isLoweredConst(vector_name)) + oss << "__host_" << vector_name; + else + oss << vector_name; + oss 
<< ", "; + if (mir_context_->isLoweredConst(vector_name)) + oss << "__device_" << vector_name; + else + oss << vector_name; + oss << ", 0, "; + oss << "__delta_param"; + oss << ", "; + pqae->starting_node->accept(this); + oss << ");" << std::endl; + } else if(mir::isa(assign_stmt->expr)) { + mir::VertexSetWhereExpr::Ptr vswe = mir::to(assign_stmt->expr); + if(!mir_context_->isConstVertexSet(vswe->target)) { + assert(false && "GPU backend currently doesn't support vertex where on non-const sets"); + } + auto associated_element_type = mir_context_->getElementTypeFromVectorOrSetName(vswe->target); + assert(associated_element_type != nullptr); + auto associated_edge_set = mir_context_->getEdgeSetFromElementType(associated_element_type); + assert(associated_edge_set != nullptr); + + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + oss << "gpu_runtime::create_new_vertex_set("; + oss << "__host_" << associated_edge_set->name << ".num_vertices, 0);" << std::endl; + printIndent(); + oss << "gpu_runtime::vertex_set_where<"; + oss << vswe->input_func << ">"; + oss << "(__host_" << associated_edge_set->name << ".num_vertices, "; + assign_stmt->lhs->accept(this); + oss << ");" << std::endl; + + } else { + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + assign_stmt->expr->accept(this); + oss << ";" << std::endl; + } +} + + +void CodeGenGPUFusedKernel::visit(mir::AssignStmt::Ptr assign_stmt) { + if (mir::isa(assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); + genEdgeSetApplyExpr(esae, assign_stmt->lhs); + } else { + if (mir::isa(assign_stmt->lhs) && is_hoisted_var(mir::to(assign_stmt->lhs)->var)) { + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + assign_stmt->expr->accept(this); + oss << ";" << std::endl; + } else { + printIndent(); + oss << "if (_thread_id == 0) " << std::endl; + indent(); + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + assign_stmt->expr->accept(this); + oss << ";" << std::endl; + dedent(); + printIndent(); + oss << "_grid.sync();" << std::endl; + } + } +} + +void CodeGenGPU::generateBinaryExpr(mir::BinaryExpr::Ptr expr, std::string token) { + oss << "("; + expr->lhs->accept(this); + oss << " " << token << " "; + expr->rhs->accept(this); + oss << ")"; +} +void CodeGenGPU::visit(mir::AddExpr::Ptr expr) { + generateBinaryExpr(expr, "+"); +} +void CodeGenGPU::visit(mir::MulExpr::Ptr expr) { + generateBinaryExpr(expr, "*"); +} +void CodeGenGPU::visit(mir::DivExpr::Ptr expr) { + generateBinaryExpr(expr, "/"); +} +void CodeGenGPU::visit(mir::SubExpr::Ptr expr) { + generateBinaryExpr(expr, "-"); +} +void CodeGenGPU::visit(mir::NegExpr::Ptr expr) { + if (expr->negate) + oss << "-"; + oss << "("; + expr->operand->accept(this); + oss << ")"; +} + + +void CodeGenGPU::visit(mir::TensorArrayReadExpr::Ptr expr) { + expr->target->accept(this); + oss << "["; + expr->index->accept(this); + oss << "]"; +} +void CodeGenGPUHost::visit(mir::TensorArrayReadExpr::Ptr expr) { + mir::VarExpr::Ptr var_expr = mir::to(expr->target); + expr->target->accept(this); + oss << "["; + expr->index->accept(this); + oss << "]"; +} + +void CodeGenGPU::visit(mir::IntLiteral::Ptr expr) { + oss << expr->val; +} +void CodeGenGPU::visit(mir::FloatLiteral::Ptr expr) { + oss << "((float)" << expr->val << ")"; +} +void CodeGenGPU::visit(mir::StringLiteral::Ptr expr) { + oss << "\""; + for (auto ch : expr->val) + if (iscntrl(ch) || ch == '\\' || ch == '\"' || ch == '\'') + oss << "\\0" << std::oct << (int)(ch); + else + 
oss << ch; + oss << "\""; +} +void CodeGenGPU::visit(mir::ReduceStmt::Ptr reduce_stmt) { + switch (reduce_stmt->reduce_op_) { + case mir::ReduceStmt::ReductionOp::SUM: + printIndent(); + reduce_stmt->lhs->accept(this); + oss << " += "; + reduce_stmt->expr->accept(this); + oss << ";" << std::endl; + if (reduce_stmt->tracking_var_name_ != "") { + printIndent(); + oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; + } + break; + case mir::ReduceStmt::ReductionOp::MIN: + printIndent(); + oss << "if (("; + reduce_stmt->lhs->accept(this); + oss << ") > ("; + reduce_stmt->expr->accept(this); + oss << ")) {" << std::endl; + indent(); + printIndent(); + reduce_stmt->lhs->accept(this); + oss << " = "; + reduce_stmt->expr->accept(this); + oss << ";" << std::endl; + + if (reduce_stmt->tracking_var_name_ != "") { + printIndent(); + oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; + } + dedent(); + printIndent(); + oss << "}" << std::endl; + break; + case mir::ReduceStmt::ReductionOp::MAX: + printIndent(); + oss << "if (("; + reduce_stmt->lhs->accept(this); + oss << ") < ("; + reduce_stmt->expr->accept(this); + oss << ")) {" << std::endl; + indent(); + printIndent(); + reduce_stmt->lhs->accept(this); + oss << " = "; + reduce_stmt->expr->accept(this); + oss << ";" << std::endl; + + if (reduce_stmt->tracking_var_name_ != "") { + printIndent(); + oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; + } + dedent(); + printIndent(); + oss << "}" << std::endl; + break; + case mir::ReduceStmt::ReductionOp::ATOMIC_MIN: + printIndent(); + if (reduce_stmt->tracking_var_name_ != "") + oss << reduce_stmt->tracking_var_name_ << " = "; + oss << "gpu_runtime::writeMin(&"; + reduce_stmt->lhs->accept(this); + oss << ", "; + reduce_stmt->expr->accept(this); + oss << ");" << std::endl; + break; + case mir::ReduceStmt::ReductionOp::ATOMIC_SUM: + if (reduce_stmt->tracking_var_name_ != "") { + printIndent(); + oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; + } + printIndent(); + oss << "gpu_runtime::writeAdd(&"; + reduce_stmt->lhs->accept(this); + oss << ", "; + reduce_stmt->expr->accept(this); + oss << ");" << std::endl; + break; + } + +} + +void CodeGenGPU::visit(mir::EnqueueVertex::Ptr enqueue_vertex) { + printIndent(); + if (enqueue_vertex->type == mir::EnqueueVertex::Type::SPARSE) { + oss << "gpu_runtime::enqueueVertexSparseQueue"; + if (enqueue_vertex->fused_dedup) { + oss << "Dedup"; + if (enqueue_vertex->fused_dedup_perfect) { + oss <<"Perfect"; + } + } + oss << "("; + enqueue_vertex->vertex_frontier->accept(this); + oss << ".d_sparse_queue_output"; + } else if (enqueue_vertex->type == mir::EnqueueVertex::Type::BOOLMAP) { + oss << "gpu_runtime::enqueueVertexBytemap("; + enqueue_vertex->vertex_frontier->accept(this); + oss << ".d_byte_map_output"; + } else if (enqueue_vertex->type == mir::EnqueueVertex::Type::BITMAP) { + oss << "gpu_runtime::enqueueVertexBitmap("; + enqueue_vertex->vertex_frontier->accept(this); + oss << ".d_bit_map_output"; + } + oss << ", "; + enqueue_vertex->vertex_frontier->accept(this); + oss << ".d_num_elems_output, "; + enqueue_vertex->vertex_id->accept(this); + if (enqueue_vertex->type == mir::EnqueueVertex::Type::SPARSE && enqueue_vertex->fused_dedup == true) { + oss << ", "; + enqueue_vertex->vertex_frontier->accept(this); + } + oss << ");" << std::endl; + +} + +void CodeGenGPU::visit(mir::CompareAndSwapStmt::Ptr cas_stmt) { + printIndent(); + if (cas_stmt->tracking_var_ != "") + oss << cas_stmt->tracking_var_ << " = "; + 
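// Illustrative sketch, not part of this patch: for a tracked BFS-style update the
// line emitted by this visitor comes out roughly as
//     updated = gpu_runtime::CAS(&parent[dst], -1, src);
// where `updated`, `parent`, `dst` and `src` are hypothetical names standing in for
// the tracking variable, the LHS tensor read, the compare value and the new value.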
oss << "gpu_runtime::CAS(&"; + cas_stmt->lhs->accept(this); + oss << ", "; + cas_stmt->compare_val_expr->accept(this); + oss << ", "; + cas_stmt->expr->accept(this); + oss << ");" << std::endl; +} +void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { + + printIndent(); + var_decl->type->accept(this); + + oss << " " << var_decl->name; + + if (var_decl->initVal != nullptr) { + // Special case if RHS is a EdgeSetApplyExpr + oss << " = "; + var_decl->initVal->accept(this); + oss << ";" << std::endl; + + } else + oss << ";" << std::endl; + + if (mir::isa(var_decl->type)) { + if (mir_context_->graphs_with_transpose.find(var_decl->name) != mir_context_->graphs_with_transpose.end() && mir_context_->graphs_with_transpose[var_decl->name]) { + printIndent(); + var_decl->type->accept(this); + oss << " " << var_decl->name << "__transposed = "; + oss << "gpu_runtime::builtin_transpose(" << var_decl->name << ");" << std::endl; + } + } + +} +void CodeGenGPUFusedKernel::visit(mir::VarDecl::Ptr var_decl) { + // Do nothing for variable declarations on kernel only lower the initialization as assignment + if (var_decl->initVal != nullptr) { + printIndent(); + oss << "__local_" << var_decl->name << " = "; + var_decl->initVal->accept(this); + oss << ";" << std::endl; + } +} +void CodeGenGPU::visit(mir::VertexSetDedupExpr::Ptr vsde) { + if (vsde->perfect_dedup) + oss << "gpu_runtime::dedup_frontier_perfect("; + else + oss << "gpu_runtime::dedup_frontier("; + vsde->target->accept(this); + oss << ")"; +} +void CodeGenGPUFusedKernel::visit(mir::VertexSetDedupExpr::Ptr vsde) { + oss << "gpu_runtime::dedup_frontier_device("; + vsde->target->accept(this); + oss << ")"; +} +void CodeGenGPU::visit(mir::BoolLiteral::Ptr bool_literal) { + oss << bool_literal->val?"true":"false"; +} +void CodeGenGPU::visit(mir::ForStmt::Ptr for_stmt) { + printIndent(); + oss << "for (int32_t " << for_stmt->loopVar << " = "; + for_stmt->domain->lower->accept(this); + oss << "; " << for_stmt->loopVar << " < "; + for_stmt->domain->upper->accept(this); + oss << "; " << for_stmt->loopVar << "++) {" << std::endl; + indent(); + for_stmt->body->accept(this); + dedent(); + printIndent(); + oss << "}" << std::endl; +} +void CodeGenGPU::visit(mir::WhileStmt::Ptr while_stmt) { + if (while_stmt->is_fused == true) { + for (auto var: while_stmt->hoisted_vars) { + bool to_copy = true; + for (auto decl: while_stmt->hoisted_decls) { + if (decl->name == var.getName()) { + to_copy = false; + break; + } + } + if (!to_copy) + continue; + printIndent(); + oss << "cudaMemcpyToSymbol(" << while_stmt->fused_kernel_name << "_" << var.getName() << ", &" << var.getName() << ", sizeof(" << var.getName() << "), 0, cudaMemcpyHostToDevice);" << std::endl; + } + for (auto var: while_stmt->used_priority_queues) { + printIndent(); + oss << "cudaMemcpyToSymbol(" << var.getName() << ", &__host_" << var.getName() << ", sizeof(__host_" << var.getName() << "), 0);" << std::endl; + } + printIndent(); + oss << "cudaLaunchCooperativeKernel((void*)" << while_stmt->fused_kernel_name << ", NUM_CTA, CTA_SIZE, gpu_runtime::no_args);" << std::endl; + for (auto var: while_stmt->used_priority_queues) { + printIndent(); + oss << "cudaMemcpyFromSymbol(&__host_" << var.getName() << ", " << var.getName() << ", sizeof(__host_" << var.getName() << "), 0);" << std::endl; + } + for (auto var: while_stmt->hoisted_vars) { + bool to_copy = true; + for (auto decl: while_stmt->hoisted_decls) { + if (decl->name == var.getName()) { + to_copy = false; + break; + } + } + if (!to_copy) + continue; + 
printIndent(); + oss << "cudaMemcpyFromSymbol(&" << var.getName() << ", " << while_stmt->fused_kernel_name << "_" << var.getName() << ", sizeof(" << var.getName() << "), 0, cudaMemcpyDeviceToHost);" << std::endl; + } + return; + } + + ExtractReadWriteSet extractor(mir_context_); + while_stmt->cond->accept(&extractor); + + printIndent(); + oss << "while ("; + while_stmt->cond->accept(this); + oss << ") {" << std::endl; + indent(); + for (auto tare: extractor.write_set) { + generateHostToDeviceCopy(tare); + } + while_stmt->body->accept(this); + for (auto tare: extractor.read_set) { + generateDeviceToHostCopy(tare); + } + dedent(); + printIndent(); + oss << "}" << std::endl; +} +void CodeGenGPU::visit(mir::IfStmt::Ptr if_stmt) { + printIndent(); + oss << "if ("; + if_stmt->cond->accept(this); + oss << ") {" << std::endl; + indent(); + if_stmt->ifBody->accept(this); + dedent(); + printIndent(); + oss << "}"; + if (if_stmt->elseBody != nullptr) { + oss << " else {" << std::endl; + indent(); + if_stmt->elseBody->accept(this); + dedent(); + printIndent(); + oss << "}"; + } + oss << std::endl; +} +void CodeGenGPUHost::visit(mir::PrintStmt::Ptr print_stmt) { + printIndent(); + oss << "std::cout << "; + print_stmt->expr->accept(this); + oss << " << std::endl;" << std::endl; +} +void CodeGenGPU::visit(mir::PrintStmt::Ptr print_stmt) { + assert(false && "Cannot print from device function\n"); +} +void CodeGenGPUFusedKernel::visit(mir::PrintStmt::Ptr print_stmt) { + printIndent(); + oss << "if (_thread_id == 0)" << std::endl; + indent(); + printIndent(); + oss << "gpu_runtime::print("; + print_stmt->expr->accept(this); + oss << ");" << std::endl; + dedent(); + printIndent(); + oss << "_grid.sync();" << std::endl; +} +void CodeGenGPUHost::visit(mir::Call::Ptr call_expr) { + if (call_expr->name == "dequeue_ready_set" || call_expr->name == "finished") { + if (call_expr->name == "dequeue_ready_set") + call_expr->name = "dequeueReadySet"; + mir::VarExpr::Ptr pq_expr = mir::to(call_expr->args[0]); + std::string pq_name = pq_expr->var.getName(); + + oss << "__host_" << pq_name << "." 
<< call_expr->name << "(__device_" << pq_name << ")"; + return; + } + if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") + oss << "gpu_runtime::" << call_expr->name << "("; + else + oss << call_expr->name << "("; + + bool printDelimeter = false; + for (auto arg: call_expr->args) { + if (printDelimeter) + oss << ", "; + arg->accept(this); + printDelimeter = true; + } + oss << ")"; +} + +void CodeGenGPU::visit(mir::Call::Ptr call_expr) { + if (call_expr->name == "dequeue_ready_set" || call_expr->name == "finished") { + if (call_expr->name == "dequeue_ready_set") + call_expr->name = "dequeueReadySet"; + mir::VarExpr::Ptr pq_expr = mir::to(call_expr->args[0]); + pq_expr->accept(this); + oss << ".device_" << call_expr->name << "()"; + return; + } + if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") + oss << "gpu_runtime::device_" << call_expr->name << "("; + else + oss << call_expr->name << "("; + + bool printDelimeter = false; + for (auto arg: call_expr->args) { + if (printDelimeter) + oss << ", "; + arg->accept(this); + printDelimeter = true; + } + oss << ")"; +} + +void CodeGenGPU::visit(mir::EqExpr::Ptr eq_expr) { + oss << "("; + eq_expr->operands[0]->accept(this); + oss << ")"; + + for (unsigned i = 0; i < eq_expr->ops.size(); ++i) { + switch(eq_expr->ops[i]) { + case mir::EqExpr::Op::LT: + oss << " < "; + break; + case mir::EqExpr::Op::LE: + oss << " <= "; + break; + case mir::EqExpr::Op::GT: + oss << " > "; + break; + case mir::EqExpr::Op::GE: + oss << " >= "; + break; + case mir::EqExpr::Op::EQ: + oss << " == "; + break; + case mir::EqExpr::Op::NE: + oss << " != "; + break; + default: + assert(false && "Invalid operator for EqExpr\n"); + + } + oss << "("; + eq_expr->operands[i+1]->accept(this); + oss << ")"; + } +} +void CodeGenGPU::visit(mir::BreakStmt::Ptr break_stmt) { + printIndent(); + oss << "break;" << std::endl; +} +void CodeGenGPU::visit(mir::VertexSetApplyExpr::Ptr vsae) { + auto mir_var = mir::to (vsae->target); + if (!mir_context_->isConstVertexSet(mir_var->var.getName())) { + // This assumes that the parent of the expression is a ExprStmt + oss << "gpu_runtime::vertex_set_prepare_sparse("; + oss << mir_var->var.getName(); + oss << ");" << std::endl; + printIndent(); + oss << mir_var->var.getName() << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + printIndent(); + } + oss << "gpu_runtime::vertex_set_apply_kernel<"; + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + oss << "gpu_runtime::AccessorAll"; + } else { + oss << "gpu_runtime::AccessorSparse"; + } + oss << ", "; + oss << vsae->input_function->function_name->name << ">"; + oss << "<<>>"; + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + auto associated_element_type = mir_context_->getElementTypeFromVectorOrSetName(mir_var->var.getName()); + assert(associated_element_type != nullptr); + //auto associated_element_type_size = mir_context_->getElementCount(associated_element_type); + //assert(associated_element_type_size != nullptr); + auto associated_edge_set = mir_context_->getEdgeSetFromElementType(associated_element_type); + assert(associated_edge_set != nullptr); + oss << "("; + //associated_element_type_size->accept(this); + oss << "__host_" << associated_edge_set->name << ".getFullFrontier()"; + oss << ")"; + } else { + oss << "("; + oss << mir_var->var.getName(); + oss << ")"; + } +} +void CodeGenGPUFusedKernel::visit(mir::VertexSetApplyExpr::Ptr vsae) { + auto 
mir_var = mir::to (vsae->target); + if (!mir_context_->isConstVertexSet(mir_var->var.getName())) { + // This assumes that the parent of the expression is a ExprStmt + oss << "gpu_runtime::vertex_set_prepare_sparse_device("; + oss << var_name(mir_var->var.getName()); + oss << ");" << std::endl; + printIndent(); + oss << var_name(mir_var->var.getName()) << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + printIndent(); + } + oss << "gpu_runtime::vertex_set_apply<"; + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + oss << "gpu_runtime::AccessorAll"; + } else { + oss << "gpu_runtime::AccessorSparse"; + } + oss << ", "; + oss << vsae->input_function->function_name->name << ">"; + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + auto associated_element_type = mir_context_->getElementTypeFromVectorOrSetName(mir_var->var.getName()); + assert(associated_element_type != nullptr); + auto associated_edge_set = mir_context_->getEdgeSetFromElementType(associated_element_type); + assert(associated_edge_set != nullptr); + oss << "("; + oss << var_name(associated_edge_set->name) << ".getFullFrontier()"; + oss << ")"; + } else { + oss << "("; + oss << var_name(mir_var->var.getName()); + oss << ")"; + } + oss << ";" << std::endl; + printIndent(); + oss << "_grid.sync()"; + +} +void CodeGenGPU::visit(mir::VertexSetAllocExpr::Ptr vsae) { + mir::Expr::Ptr size_expr = mir_context_->getElementCount(vsae->element_type); + oss << "gpu_runtime::create_new_vertex_set("; + size_expr->accept(this); + oss << ", "; + if (vsae->size_expr == nullptr) + oss << "0"; + else + vsae->size_expr->accept(this); + oss << ")"; +} +void CodeGenGPU::generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare) { + printIndent(); + mir::Var target = mir::to(tare->target)->var; + std::string var_name = target.getName(); + oss << "cudaMemcpy(__host_" << var_name << " + "; + tare->index->accept(this); + oss << ", __device_" << var_name << " + "; + tare->index->accept(this); + oss << ", sizeof("; + mir::to(target.getType())->vector_element_type->accept(this); + oss << "), cudaMemcpyDeviceToHost);" << std::endl; + +} +void CodeGenGPU::generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare) { + printIndent(); + mir::Var target = mir::to(tare->target)->var; + std::string var_name = target.getName(); + oss << "cudaMemcpy(__device_" << var_name << " + "; + tare->index->accept(this); + oss << ", __host_" << var_name << " + "; + tare->index->accept(this); + oss << ", sizeof("; + mir::to(target.getType())->vector_element_type->accept(this); + oss << "), cudaMemcpyHostToDevice);" << std::endl; +} +void CodeGenGPUHost::visit(mir::StmtBlock::Ptr stmt_block) { + for (auto stmt: *(stmt_block->stmts)) { + ExtractReadWriteSet extractor(mir_context_); + stmt->accept(&extractor); + for (auto tare: extractor.read_set) { + generateDeviceToHostCopy(tare); + } + stmt->accept(this); + for (auto tare: extractor.write_set) { + generateHostToDeviceCopy(tare); + } + } +} +void CodeGenGPU::visit(mir::HybridGPUStmt::Ptr stmt) { + if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { + printIndent(); + oss << "if (gpu_runtime::builtin_getVertexSetSize(" << stmt->input_frontier_name << ") < " << stmt->input_frontier_name << ".max_num_elems * "; + oss << stmt->threshold_var_name; + oss << ") {" << std::endl; + indent(); + stmt->stmt1->accept(this); + dedent(); + printIndent(); + oss << "} else {" << std::endl; + indent(); + stmt->stmt2->accept(this); + dedent(); + 
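// Illustrative sketch, not part of this patch: the guard emitted by this visitor looks
// roughly like (frontier and threshold names hypothetical)
//     if (gpu_runtime::builtin_getVertexSetSize(frontier) < frontier.max_num_elems * __threshold_0) {
//         /* code generated from stmt1 */
//     } else {
//         /* code generated from stmt2 */
//     }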
printIndent(); + oss << "}" << std::endl; + } else { + assert(false && "Invalid criteria for Hybrid Statement\n"); + } +} +void CodeGenGPUFusedKernel::visit(mir::HybridGPUStmt::Ptr stmt) { + if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { + printIndent(); + oss << "if (gpu_runtime::device_builtin_getVertexSetSize(" << var_name(stmt->input_frontier_name) << ") < " << var_name(stmt->input_frontier_name) << ".max_num_elems * "; + oss << "__device_" << stmt->threshold_var_name; + oss << ") {" << std::endl; + indent(); + stmt->stmt1->accept(this); + dedent(); + printIndent(); + oss << "} else {" << std::endl; + indent(); + stmt->stmt2->accept(this); + dedent(); + printIndent(); + oss << "}" << std::endl; + } else { + assert(false && "Invalid criteria for Hybrid Statement\n"); + } +} + +void CodeGenGPU::visit(mir::VertexSetWhereExpr::Ptr expr) { + assert(false && "VertexSetWhereExpr should be handled in AssignStmt"); +} + +} diff --git a/src/backend/codegen_gpu/extract_read_write_set.cpp b/src/backend/codegen_gpu/extract_read_write_set.cpp new file mode 100644 index 00000000..976e561e --- /dev/null +++ b/src/backend/codegen_gpu/extract_read_write_set.cpp @@ -0,0 +1,36 @@ +#include "graphit/backend/codegen_gpu/extract_read_write_set.h" + +namespace graphit { +void ExtractReadWriteSet::visit(mir::StmtBlock::Ptr stmt_block) { + return; +} +void ExtractReadWriteSet::visit(mir::TensorArrayReadExpr::Ptr tare) { + mir::Var target = mir::to(tare->target)->var; + if (mir_context->isLoweredConstTensor(target.getName())) { + add_read(tare); + } + tare->index->accept(this); +} +void ExtractReadWriteSet::visit(mir::AssignStmt::Ptr assign_stmt) { + if (mir::isa(assign_stmt->lhs)) { + mir::TensorArrayReadExpr::Ptr tare = mir::to(assign_stmt->lhs); + mir::Var target = mir::to(tare->target)->var; + if (mir_context->isLoweredConstTensor(target.getName())) { + add_write(tare); + tare->index->accept(this); + assign_stmt->expr->accept(this); + } + tare->index->accept(this); + + } else { + assign_stmt->lhs->accept(this); + assign_stmt->expr->accept(this); + } +} +void ExtractReadWriteSet::add_read(mir::TensorArrayReadExpr::Ptr tare) { + read_set_.push_back(tare); +} +void ExtractReadWriteSet::add_write(mir::TensorArrayReadExpr::Ptr tare) { + write_set_.push_back(tare); +} +} diff --git a/src/frontend/parser.cpp b/src/frontend/parser.cpp index deef25bc..767173bd 100644 --- a/src/frontend/parser.cpp +++ b/src/frontend/parser.cpp @@ -2796,6 +2796,9 @@ namespace graphit { intrinsics_.push_back("append"); intrinsics_.push_back("pop"); intrinsics_.push_back("transpose"); + intrinsics_.push_back("insert"); + intrinsics_.push_back("retrieve"); + // set up function call intrinsics decls.insert("fabs", IdentType::FUNCTION); diff --git a/src/graphitc.py b/src/graphitc.py index d2530427..97a4cd3b 100644 --- a/src/graphitc.py +++ b/src/graphitc.py @@ -75,6 +75,7 @@ def parseArgs(): compile_file.write("#include \n") compile_file.write("namespace graphit {\n") + compile_file.write("using namespace graphit::fir::gpu_schedule;\n") compile_file.write("void user_defined_schedule (graphit::fir::high_level_schedule::ProgramScheduleNode::Ptr program) {\n") for schedule_cmd in schedule_cmd_list: compile_file.write(schedule_cmd) diff --git a/src/main.cpp b/src/main.cpp index 5b9f7fc5..47938b72 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -59,19 +59,12 @@ int main(int argc, char* argv[]) { std::string python_module_path = cli.python_module_path(); - be->emitCPP(output_file, 
python_module_name); + if (program->backend_selection == fir::high_level_schedule::ProgramScheduleNode::backend_selection_type::CODEGEN_GPU) + be->emitGPU(output_file, python_module_name); + else + be->emitCPP(output_file, python_module_name); output_file.close(); -/* - if (python_module_name != "") { - if (python_module_path == "") - python_module_path = "/tmp"; - std::ofstream python_output_file; - python_output_file.open(python_module_path + "/" + python_module_name + ".py"); - be->emitPython(python_output_file, python_module_name, python_module_path) ; - python_output_file.close(); - - } -*/ + delete be; return 0; diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 10430424..a7e27e67 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -35,6 +35,232 @@ namespace graphit { node = vertexset_apply; } + void ApplyExprLower::LowerApplyExpr::visit(mir::StmtBlock::Ptr stmt_block) { + std::vector new_stmts; + for (auto stmt: *(stmt_block->stmts)) { + new_stmts.push_back(rewrite(stmt)); + while (insert_after_stmt != nullptr) { + auto temp = insert_after_stmt; + insert_after_stmt = nullptr; + temp = rewrite(temp); + new_stmts.push_back(temp); + } + } + * (stmt_block->stmts) = new_stmts; + node = stmt_block; + } + void ApplyExprLower::LowerApplyExpr::visit(mir::VarDecl::Ptr var_decl) { + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + if (mir::isa (var_decl->initVal) || mir::isa(var_decl->initVal)) { + auto init_val = var_decl->initVal; + var_decl->initVal = nullptr; + mir::AssignStmt::Ptr assign_stmt = std::make_shared(); + assign_stmt->expr = init_val; + mir::VarExpr::Ptr var_expr = std::make_shared(); + mir::Var var (var_decl->name, var_decl->type); + var_expr->var = var; + assign_stmt->lhs = var_expr; + assign_stmt->stmt_label = var_decl->stmt_label; + insert_after_stmt = assign_stmt; + node = var_decl; + return; + } + } + MIRRewriter::visit(var_decl); + var_decl = mir::to(node); + node = var_decl; + } + void ApplyExprLower::LowerApplyExpr::visit(mir::AssignStmt::Ptr assign_stmt) { + + if (assign_stmt->stmt_label != "") { + label_scope_.scope(assign_stmt->stmt_label); + } + + // Check for Hybrid stmt + if (mir::isa (assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(assign_stmt->expr); + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule) != nullptr) { + fir::gpu_schedule::HybridGPUSchedule *hybrid_schedule = dynamic_cast(apply_schedule); + // This EdgeSetApply has a Hybrid Schedule attached to it + // Create the first Stmt block + mir::StmtBlock::Ptr stmt_block_1 = std::make_shared(); + mir::AssignStmt::Ptr stmt1 = std::make_shared(); + stmt1->lhs = assign_stmt->lhs; + stmt1->expr = assign_stmt->expr; + stmt1->stmt_label = "hybrid1"; + stmt_block_1->insertStmtEnd(stmt1); + fir::gpu_schedule::SimpleGPUSchedule * schedule1 = new fir::gpu_schedule::SimpleGPUSchedule(); + *schedule1 = hybrid_schedule->s1; + schedule_->apply_gpu_schedules[current_scope_name + ":hybrid1"] = schedule1; + stmt_block_1 = rewrite(stmt_block_1); + + // Now create the second Stmt block + auto func_decl = mir_context_->getFunction(edgeset_apply->input_function->function_name->name); + 
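// Note: the hybrid lowering clones the user-defined apply function so each branch can be
// specialized independently; an `updateEdge` UDF gets a `_v2` copy that the second
// statement block references. A hybrid schedule of this kind would be written by the user
// roughly as follows (the API names and exact constructor signature here are assumptions,
// shown only as a sketch):
//     SimpleGPUSchedule s1, s2;
//     s1.configDirection(PUSH);
//     s2.configDirection(PULL, BITMAP);
//     HybridGPUSchedule h(INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2);
//     program->applyGPUSchedule("s0:s1", h);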
mir::FuncDecl::Ptr func_decl_v2 = func_decl->clone(); + func_decl_v2->name = func_decl->name + "_v2"; + mir_context_->addFunctionFront(func_decl_v2); + mir::StmtBlock::Ptr stmt_block_2 = std::make_shared(); + mir::AssignStmt::Ptr stmt2 = std::make_shared(); + stmt2->lhs = assign_stmt->lhs; + stmt2->expr = assign_stmt->expr; + + mir::FuncExpr::Ptr new_func_expr = std::make_shared(); + new_func_expr->function_name = std::make_shared(); + new_func_expr->function_name->name = func_decl_v2->name; + + + mir::to(stmt2->expr)->input_function= new_func_expr; + stmt2->stmt_label = "hybrid2"; + stmt_block_2->insertStmtEnd(stmt2); + fir::gpu_schedule::SimpleGPUSchedule * schedule2 = new fir::gpu_schedule::SimpleGPUSchedule(); + *schedule2 = hybrid_schedule->s2; + schedule_->apply_gpu_schedules[current_scope_name + ":hybrid2"] = schedule2; + stmt_block_2 = rewrite(stmt_block_2); + + // Finally create a hybrid statement and replace - + mir::HybridGPUStmt::Ptr hybrid_node = std::make_shared(); + hybrid_node->stmt1 = stmt_block_1; + hybrid_node->stmt2 = stmt_block_2; + hybrid_node->threshold = hybrid_schedule->threshold; + hybrid_node->argv_index = hybrid_schedule->argv_index; + hybrid_node->criteria = hybrid_schedule->_hybrid_criteria; + if (hybrid_node->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE && edgeset_apply->from_func->function_name->name != "") { + hybrid_node->input_frontier_name = edgeset_apply->from_func->function_name->name; + } else { + assert(false && "Invalid criteria for Hybrid Node\n"); + } + + node = hybrid_node; + mir_context_->hybrid_gpu_stmts.push_back(hybrid_node); + if (assign_stmt->stmt_label != "") { + label_scope_.unscope(); + } + return; + + } + } + } + } + if (assign_stmt->stmt_label != "") { + label_scope_.unscope(); + } + + + MIRRewriter::visit(assign_stmt); + assign_stmt = mir::to(node); + if (mir::isa (assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(assign_stmt->expr); + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty() && edgeset_apply->enable_deduplication == true && edgeset_apply->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + if (edgeset_apply->applied_schedule.deduplication_strategy == fir::gpu_schedule::SimpleGPUSchedule::deduplication_strategy_type::DEDUP_FUSED) { + edgeset_apply->fused_dedup = true; + edgeset_apply->fused_dedup_perfect = true; + } else { + mir::VertexSetDedupExpr::Ptr dedup_expr = std::make_shared(); + mir::ExprStmt::Ptr expr_stmt = std::make_shared(); + dedup_expr->target = assign_stmt->lhs; + expr_stmt->expr = dedup_expr; + insert_after_stmt = expr_stmt; + dedup_expr->perfect_dedup = true; + edgeset_apply->fused_dedup = false; + } + } else if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty() && edgeset_apply->applied_schedule.deduplication == fir::gpu_schedule::SimpleGPUSchedule::deduplication_type::DEDUP_ENABLED && edgeset_apply->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + if (edgeset_apply->applied_schedule.deduplication_strategy == fir::gpu_schedule::SimpleGPUSchedule::deduplication_strategy_type::DEDUP_FUSED) { + edgeset_apply->fused_dedup = true; + edgeset_apply->fused_dedup_perfect = false; + } else { + mir::VertexSetDedupExpr::Ptr dedup_expr = std::make_shared(); + mir::ExprStmt::Ptr expr_stmt = std::make_shared(); + dedup_expr->target = assign_stmt->lhs; + expr_stmt->expr = dedup_expr; + 
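// Illustrative sketch, not part of this patch: the deduplication statement inserted here
// eventually lowers to a runtime call such as
//     gpu_runtime::dedup_frontier(output_frontier);
// (or its _perfect / _device variants), where `output_frontier` stands for the frontier
// produced by the preceding edgeset apply.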
insert_after_stmt = expr_stmt; + dedup_expr->perfect_dedup = false; + edgeset_apply->fused_dedup = false; + } + } + } + node = assign_stmt; + } + void ApplyExprLower::LowerApplyExpr::visit(mir::ExprStmt::Ptr expr_stmt) { + if (expr_stmt->stmt_label != "") { + label_scope_.scope(expr_stmt->stmt_label); + } + if (mir::isa (expr_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(expr_stmt->expr); + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule) != nullptr) { + fir::gpu_schedule::HybridGPUSchedule *hybrid_schedule = dynamic_cast(apply_schedule); + // This EdgeSetApply has a Hybrid Schedule attached to it + // Create the first Stmt block + mir::StmtBlock::Ptr stmt_block_1 = std::make_shared(); + mir::ExprStmt::Ptr stmt1 = std::make_shared(); + stmt1->expr = expr_stmt->expr; + stmt1->stmt_label = "hybrid1"; + stmt_block_1->insertStmtEnd(stmt1); + fir::gpu_schedule::SimpleGPUSchedule * schedule1 = new fir::gpu_schedule::SimpleGPUSchedule(); + *schedule1 = hybrid_schedule->s1; + schedule_->apply_gpu_schedules[current_scope_name + ":hybrid1"] = schedule1; + stmt_block_1 = rewrite(stmt_block_1); + + // Now create the second Stmt block + auto func_decl = mir_context_->getFunction(edgeset_apply->input_function->function_name->name); + mir::FuncDecl::Ptr func_decl_v2 = func_decl->clone(); + func_decl_v2->name = func_decl->name + "_v2"; + mir_context_->addFunctionFront(func_decl_v2); + mir::StmtBlock::Ptr stmt_block_2 = std::make_shared(); + mir::ExprStmt::Ptr stmt2 = std::make_shared(); + stmt2->expr = expr_stmt->expr; + + mir::FuncExpr::Ptr new_func_expr = std::make_shared(); + new_func_expr->function_name = std::make_shared(); + new_func_expr->function_name->name = func_decl_v2->name; + + mir::to(stmt2->expr)->input_function = new_func_expr; + stmt2->stmt_label = "hybrid2"; + stmt_block_2->insertStmtEnd(stmt2); + fir::gpu_schedule::SimpleGPUSchedule * schedule2 = new fir::gpu_schedule::SimpleGPUSchedule(); + *schedule2 = hybrid_schedule->s2; + schedule_->apply_gpu_schedules[current_scope_name + ":hybrid2"] = schedule2; + stmt_block_2 = rewrite(stmt_block_2); + + // Finally create a hybrid statement and replace - + mir::HybridGPUStmt::Ptr hybrid_node = std::make_shared(); + hybrid_node->stmt1 = stmt_block_1; + hybrid_node->stmt2 = stmt_block_2; + hybrid_node->threshold = hybrid_schedule->threshold; + hybrid_node->argv_index = hybrid_schedule->argv_index; + hybrid_node->criteria = hybrid_schedule->_hybrid_criteria; + if (hybrid_node->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE && edgeset_apply->from_func->function_name->name != "") { + hybrid_node->input_frontier_name = edgeset_apply->from_func->function_name->name; + } else { + assert(false && "Invalid criteria for Hybrid Node\n"); + } + + node = hybrid_node; + mir_context_->hybrid_gpu_stmts.push_back(hybrid_node); + if (expr_stmt->stmt_label != "") { + label_scope_.unscope(); + } + return; + + } + } + } + } + if (expr_stmt->stmt_label != "") { + label_scope_.unscope(); + } + MIRRewriter::visit(expr_stmt); + node = expr_stmt; + } void ApplyExprLower::LowerApplyExpr::visit(mir::EdgeSetApplyExpr::Ptr edgeset_apply) { // use the target var expressionto figure out the 
edgeset type @@ -49,6 +275,43 @@ namespace graphit { edgeset_apply->is_weighted = true; } + + + // First check if the program has a GPU Schedule, if yes, the defaults are different + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + // Always parallelize all operators for GPU schedules + edgeset_apply->is_parallel = true; + if (edgeset_apply->tracking_field != "") + edgeset_apply->requires_output = true; + // Check if there is a GPU schedule attached to this statement - + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule) != nullptr) { + edgeset_apply->applied_schedule = *dynamic_cast(apply_schedule); + } else { + assert(false && "Schedule applied to EdgeSetApply must be a Simple Schedule"); + } + if (edgeset_apply->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) + node = std::make_shared(edgeset_apply); + else if (edgeset_apply->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + node = std::make_shared(edgeset_apply); + mir_context_->graphs_with_transpose[mir::to(edgeset_apply->target)->var.getName()] = true; + } else + assert(false && "Invalid option for direction\n"); + + if (edgeset_apply->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY && edgeset_apply->applied_schedule.edge_blocking == fir::gpu_schedule::SimpleGPUSchedule::edge_blocking_type::BLOCKED) { + mir_context_->graphs_with_blocking[mir::to(edgeset_apply->target)->var.getName()] = edgeset_apply->applied_schedule.edge_blocking_size; + } + + } else { + // No schedule is attached, lower using default schedule + node = std::make_shared(edgeset_apply); + } + return; + } + // check if the schedule contains entry for the current edgeset apply expressions if (schedule_ != nullptr && schedule_->apply_schedules != nullptr) { diff --git a/src/midend/atomics_op_lower.cpp b/src/midend/atomics_op_lower.cpp index 01277db9..5924d23a 100644 --- a/src/midend/atomics_op_lower.cpp +++ b/src/midend/atomics_op_lower.cpp @@ -23,7 +23,7 @@ void graphit::AtomicsOpLower::ApplyExprVisitor::visit(graphit::mir::UpdatePriori void graphit::AtomicsOpLower::ApplyExprVisitor::visit(graphit::mir::HybridDenseEdgeSetApplyExpr::Ptr apply_expr) { if (apply_expr->is_parallel){ - ReduceStmtLower reduce_stmt_lower = ReduceStmtLower(mir_context_); + ReduceStmtLower reduce_stmt_lower = ReduceStmtLower(mir_context_, schedule_); auto pull_func_name = apply_expr->input_function->function_name->name; mir::FuncDecl::Ptr pull_func_decl = mir_context_->getFunction(pull_func_name); auto push_func_name = apply_expr->push_function_->function_name->name; @@ -40,7 +40,7 @@ void graphit::AtomicsOpLower::ApplyExprVisitor::visit(graphit::mir::HybridDenseE void graphit::AtomicsOpLower::ApplyExprVisitor::singleFunctionEdgeSetApplyExprAtomicsLower(graphit::mir::EdgeSetApplyExpr::Ptr apply_expr){ if (apply_expr->is_parallel){ - ReduceStmtLower reduce_stmt_lower = ReduceStmtLower(mir_context_); + ReduceStmtLower reduce_stmt_lower = ReduceStmtLower(mir_context_, schedule_); auto apply_func_decl_name = apply_expr->input_function->function_name->name; mir::FuncDecl::Ptr apply_func_decl = mir_context_->getFunction(apply_func_decl_name); 
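// Illustrative sketch, not part of this patch: for a parallel edgeset apply, the UDF body
// is visited and eligible reductions on shared vertex data are switched to their atomic
// forms, so an SSSP-style update written as
//     SP[dst] min= SP[src] + weight;
// is lowered to an ATOMIC_MIN reduce that the GPU backend later prints as
//     gpu_runtime::writeMin(&SP[dst], SP[src] + weight);
// (`SP`, `dst`, `src` and `weight` are hypothetical names used only for this example).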
apply_func_decl->accept(&reduce_stmt_lower); @@ -289,13 +289,16 @@ void graphit::AtomicsOpLower::ReduceStmtLower::visit(graphit::mir::ReduceStmt::P break; default: std::cout << "not supported for atomics" << std::endl; - exit(0); + assert(false); } } } //If it is local vector, we still need to add atomic - else if(mir::isa(local_vector_field_type)) { + // This is definitely a bug. Not all local vectors require an atomic access + // It also seems that the mechanism to check if the acccess is atomic seems to be broken. + // This will just add atomics everywhere + else if(!(schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) && mir::isa(local_vector_field_type)) { mir::VectorType::Ptr vector_type = mir::to(local_vector_field_type); mir::Type::Ptr local_field_type = vector_type->vector_element_type; @@ -316,7 +319,7 @@ void graphit::AtomicsOpLower::ReduceStmtLower::visit(graphit::mir::ReduceStmt::P break; default: std::cout << "not supported for atomics" << std::endl; - exit(0); + assert(false); } } diff --git a/src/midend/frontier_reuse_analysis.cpp b/src/midend/frontier_reuse_analysis.cpp new file mode 100644 index 00000000..8701fdba --- /dev/null +++ b/src/midend/frontier_reuse_analysis.cpp @@ -0,0 +1,87 @@ +#include + +namespace graphit { +void FrontierReuseAnalysis::analyze(void) { + for (auto func: mir_context_->getFunctionList()) { + ReuseFindingVisitor visitor(mir_context_); + func->accept(&visitor); + } +} +bool FrontierReuseAnalysis::ReuseFindingVisitor::is_frontier_reusable(mir::StmtBlock::Ptr stmt_block, int index, std::string frontier_name) { + FrontierUseFinder finder; + finder.frontier_name = frontier_name; + index++; + for (int i = index; i < stmt_block->stmts->size(); i++) { + if (mir::isa((*(stmt_block->stmts))[i])) { + mir::ExprStmt::Ptr expr_stmt = mir::to((*(stmt_block->stmts))[i]); + if (mir::isa(expr_stmt->expr)) { + mir::Call::Ptr call_expr = mir::to(expr_stmt->expr); + if (call_expr->name == "deleteObject" && mir::isa(call_expr->args[0]) && mir::to(call_expr->args[0])->var.getName() == frontier_name) { + to_deletes.push_back(expr_stmt); + return true; + } + } + } + (*(stmt_block->stmts))[i]->accept(&finder); + if (finder.is_used) + return false; + } + return false; +} +void FrontierReuseAnalysis::ReuseFindingVisitor::visit(mir::StmtBlock::Ptr stmt_block) { + std::vector new_stmts; + to_deletes.clear(); + for (int i = 0; i < stmt_block->stmts->size(); i++) { + mir::Stmt::Ptr this_stmt = (*(stmt_block->stmts))[i]; + if (mir::isa(this_stmt)) { + mir::AssignStmt::Ptr assign_stmt = mir::to(this_stmt); + if (mir::isa(assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); + if (esae->from_func && esae->from_func->function_name->name != "" && !mir_context_->isFunction(esae->from_func->function_name->name)) { + std::string frontier_name = esae->from_func->function_name->name; + if (is_frontier_reusable(stmt_block, i, frontier_name)) { + esae->frontier_reusable = true; + } + } + } + } else if (mir::isa(this_stmt)) { + mir::VarDecl::Ptr var_decl = mir::to(this_stmt); + if (var_decl->initVal != nullptr) { + if (mir::isa(var_decl->initVal)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(var_decl->initVal); + if (esae->from_func && esae->from_func->function_name->name != "" && !mir_context_->isFunction(esae->from_func->function_name->name)) { + std::string frontier_name = esae->from_func->function_name->name; + if (is_frontier_reusable(stmt_block, i, frontier_name)) { + esae->frontier_reusable = true; + } + } + } + } + } + if 
(std::find(to_deletes.begin(), to_deletes.end(), this_stmt) == to_deletes.end()) { + new_stmts.push_back(this_stmt); + } + } + (*(stmt_block->stmts)) = new_stmts; + mir::MIRVisitor::visit(stmt_block); +} +void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::VarExpr::Ptr var_expr) { + if (var_expr->var.getName() == frontier_name) + is_used = true; +} +void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { + mir::MIRVisitor::visit(pesae); + if (pesae->from_func->function_name->name == frontier_name) + is_used = true; +} +void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { + mir::MIRVisitor::visit(pesae); + if (pesae->from_func->function_name->name == frontier_name) + is_used = true; +} +void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::EdgeSetApplyExpr::Ptr esae) { + mir::MIRVisitor::visit(esae); + if (esae->from_func->function_name->name == frontier_name) + is_used = true; +} +} diff --git a/src/midend/gpu_change_tracking_lower.cpp b/src/midend/gpu_change_tracking_lower.cpp new file mode 100644 index 00000000..226e9809 --- /dev/null +++ b/src/midend/gpu_change_tracking_lower.cpp @@ -0,0 +1,179 @@ +#include + +namespace graphit { +void GPUChangeTrackingLower::lower(void) { + UdfArgChangeVisitor visitor(mir_context_); + for (auto func: mir_context_->getFunctionList()) { + func->accept(&visitor); + } +} +void GPUChangeTrackingLower::UdfArgChangeVisitor::updateUdf(mir::FuncDecl::Ptr func_decl, mir::EdgeSetApplyExpr::Ptr esae) { + if (esae->requires_output == false) + return; + + mir::VarExpr::Ptr var_expr = mir::to(esae->target); + mir::EdgeSetType::Ptr edge_set_type = mir::to(var_expr->var.getType()); + mir::ElementType::Ptr element_type = (*(edge_set_type->vertex_element_type_list))[0]; + mir::VertexSetType::Ptr vertex_set_type = std::make_shared(); + vertex_set_type->element = element_type; + + mir::Var new_arg("__output_frontier", vertex_set_type); + func_decl->args.push_back(new_arg); + + // Now modify all the reduce stmts inside + ReductionOpChangeVisitor visitor(mir_context_, esae->tracking_field, esae, vertex_set_type); + func_decl->accept(&visitor); +} +void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { + mir::FuncDecl::Ptr func_decl = mir_context_->getFunction(pesae->input_function->function_name->name); + updateUdf(func_decl, pesae); +} +void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr pesae) { + mir::FuncDecl::Ptr func_decl = mir_context_->getFunction(pesae->input_function->function_name->name); + updateUdf(func_decl, pesae); +} +void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { + mir::FuncDecl::Ptr func_decl = mir_context_->getFunction(pesae->input_function->function_name->name); + updateUdf(func_decl, pesae); +} + +void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr stmt_block) { + std::vector new_stmts; + for (auto stmt: *(stmt_block->stmts)) { + stmt->accept(this); + bool stmt_added = false; + if (mir::isa(stmt)) { + mir::ReduceStmt::Ptr reduce_stmt = mir::to(stmt); + if (mir::isa(reduce_stmt->lhs)) { + mir::TensorReadExpr::Ptr tre = mir::to(reduce_stmt->lhs); + if (mir::isa(tre->target) && mir::to(tre->target)->var.getName() == udf_tracking_var) { + std::string result_var_name = "result_var" + mir_context_->getUniqueNameCounterString(); + reduce_stmt->tracking_var_name_ = result_var_name; + + mir::ScalarType::Ptr 
scalar_type = std::make_shared(); + scalar_type->type = mir::ScalarType::Type::BOOL; + mir::BoolLiteral::Ptr bool_literal = std::make_shared(); + bool_literal->val = false; + mir::VarDecl::Ptr decl_stmt = std::make_shared(); + decl_stmt->name = result_var_name; + decl_stmt->type = scalar_type; + decl_stmt->initVal = bool_literal; + new_stmts.push_back(decl_stmt); + new_stmts.push_back(stmt); + + // Now construct the conditional enqueue + mir::Var tracking_var(result_var_name, scalar_type); + mir::VarExpr::Ptr condition_expr = std::make_shared(); + condition_expr->var = tracking_var; + mir::IfStmt::Ptr if_stmt = std::make_shared(); + if_stmt->cond = condition_expr; + + mir::StmtBlock::Ptr stmt_block = std::make_shared(); + if_stmt->ifBody = stmt_block; + + mir::EnqueueVertex::Ptr enqueue_vertex = std::make_shared(); + mir::Var frontier_var("__output_frontier", frontier_type); + mir::VarExpr::Ptr frontier_expr = std::make_shared(); + frontier_expr->var = frontier_var; + enqueue_vertex->vertex_id = tre->index; + enqueue_vertex->vertex_frontier = frontier_expr; + enqueue_vertex->fused_dedup = current_edge_set_apply_expr->fused_dedup; + enqueue_vertex->fused_dedup_perfect = current_edge_set_apply_expr->fused_dedup_perfect; + if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BOOLMAP; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BITMAP; + } + stmt_block->insertStmtEnd(enqueue_vertex); + if_stmt->elseBody = nullptr; + new_stmts.push_back(if_stmt); + stmt_added = true; + } + } + } else if (mir::isa(stmt)) { + mir::CompareAndSwapStmt::Ptr cas_stmt = mir::to(stmt); + if (mir::isa(cas_stmt->lhs)) { + mir::TensorReadExpr::Ptr tre = mir::to(cas_stmt->lhs); + if (mir::isa(tre->target) && mir::to(tre->target)->var.getName() == udf_tracking_var) { + std::string result_var_name = "result_var" + mir_context_->getUniqueNameCounterString(); + cas_stmt->tracking_var_ = result_var_name; + + mir::ScalarType::Ptr scalar_type = std::make_shared(); + scalar_type->type = mir::ScalarType::Type::BOOL; + mir::BoolLiteral::Ptr bool_literal = std::make_shared(); + bool_literal->val = false; + mir::VarDecl::Ptr decl_stmt = std::make_shared(); + decl_stmt->name = result_var_name; + decl_stmt->type = scalar_type; + decl_stmt->initVal = bool_literal; + new_stmts.push_back(decl_stmt); + new_stmts.push_back(stmt); + + // Now construct the conditional enqueue + mir::Var tracking_var(result_var_name, scalar_type); + mir::VarExpr::Ptr condition_expr = std::make_shared(); + condition_expr->var = tracking_var; + mir::IfStmt::Ptr if_stmt = std::make_shared(); + if_stmt->cond = condition_expr; + + mir::StmtBlock::Ptr stmt_block = std::make_shared(); + if_stmt->ifBody = stmt_block; + + mir::EnqueueVertex::Ptr enqueue_vertex = std::make_shared(); + mir::Var frontier_var("__output_frontier", frontier_type); + mir::VarExpr::Ptr frontier_expr = std::make_shared(); + frontier_expr->var = frontier_var; + enqueue_vertex->vertex_id = tre->index; + enqueue_vertex->vertex_frontier = frontier_expr; + 
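// A rough sketch of what this rewrite produces inside the user-defined function
// for a reduction or compare-and-swap on the tracked field (result_var0 is an
// illustrative name; the real one comes from getUniqueNameCounterString()):
//
//   bool result_var0 = false;
//   <original reduce / CAS, now recording its success in result_var0>
//   if (result_var0)
//       <EnqueueVertex of the written index into __output_frontier>
//
// Plain assignments to the tracked field (handled further below) skip the flag
// and enqueue unconditionally. The EnqueueVertex type (SPARSE, BOOLMAP or
// BITMAP) mirrors the frontier_creation choice of the SimpleGPUSchedule applied
// to the enclosing edgeset apply.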
enqueue_vertex->fused_dedup = current_edge_set_apply_expr->fused_dedup; + enqueue_vertex->fused_dedup_perfect = current_edge_set_apply_expr->fused_dedup_perfect; + if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BOOLMAP; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BITMAP; + } + stmt_block->insertStmtEnd(enqueue_vertex); + if_stmt->elseBody = nullptr; + new_stmts.push_back(if_stmt); + stmt_added = true; + } + + } + } else if (mir::isa(stmt)) { + mir::AssignStmt::Ptr assign_stmt = mir::to(stmt); + if (mir::isa(assign_stmt->lhs)) { + mir::TensorReadExpr::Ptr tre = mir::to(assign_stmt->lhs); + if (mir::isa(tre->target) && mir::to(tre->target)->var.getName() == udf_tracking_var) { + new_stmts.push_back(stmt); + mir::EnqueueVertex::Ptr enqueue_vertex = std::make_shared(); + mir::Var frontier_var("__output_frontier", frontier_type); + mir::VarExpr::Ptr frontier_expr = std::make_shared(); + frontier_expr->var = frontier_var; + enqueue_vertex->vertex_id = tre->index; + enqueue_vertex->vertex_frontier = frontier_expr; + enqueue_vertex->fused_dedup = current_edge_set_apply_expr->fused_dedup; + enqueue_vertex->fused_dedup_perfect = current_edge_set_apply_expr->fused_dedup_perfect; + if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BOOLMAP; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BITMAP; + } + new_stmts.push_back(enqueue_vertex); + stmt_added = true; + } + } + + } + if (!stmt_added) + new_stmts.push_back(stmt); + } + *(stmt_block->stmts) = new_stmts; +} + +} diff --git a/src/midend/gpu_priority_features_lowering.cpp b/src/midend/gpu_priority_features_lowering.cpp new file mode 100644 index 00000000..8444c963 --- /dev/null +++ b/src/midend/gpu_priority_features_lowering.cpp @@ -0,0 +1,120 @@ +#include + +namespace graphit { +void GPUPriorityFeaturesLowering::lower(void) { + EdgeSetApplyPriorityRewriter rewriter(mir_context_, schedule_); + for (auto func: mir_context_->getFunctionList()) { + rewriter.rewrite(func); + } +} +void GPUPriorityFeaturesLowering::EdgeSetApplyPriorityRewriter::visit(mir::ExprStmt::Ptr expr_stmt) { + if (expr_stmt->stmt_label != "") { + label_scope_.scope(expr_stmt->stmt_label); + } + if (mir::isa(expr_stmt->expr)) { + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(expr_stmt->expr); + mir::FuncDecl::Ptr udf = mir_context_->getFunction(upesae->input_function->function_name->name); + UDFPriorityQueueFinder finder(mir_context_); + udf->accept(&finder); + mir::Var pq = finder.getPriorityQueue(); + + mir::Var 
frontier(pq.getName() + ".frontier_", nullptr); + + mir::VarExpr::Ptr lhs = std::make_shared(); + lhs->var = frontier; + + mir::AssignStmt::Ptr assign = std::make_shared(); + assign->lhs = lhs; + assign->expr = expr_stmt->expr; + node = assign; + + upesae->is_parallel = true; + upesae->requires_output = true; + upesae->priority_queue_used = pq; + mir::VarExpr::Ptr edgeset_expr = mir::to(upesae->target); + mir::EdgeSetType::Ptr edgeset_type = mir::to(edgeset_expr->var.getType()); + assert(edgeset_type->vertex_element_type_list->size() == 2); + if (edgeset_type->weight_type != nullptr) { + upesae->is_weighted = true; + } + // Now apply the schedule to the operator + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule) != nullptr) { + upesae->applied_schedule = *dynamic_cast(apply_schedule); + mir_context_->delta_ = upesae->applied_schedule.delta; + if (upesae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + mir_context_->graphs_with_transpose[mir::to(upesae->target)->var.getName()] = true; + } + } else { + assert(false && "Scedule applied to edgesetapply must be a Simple Schedule"); + } + } + } + PriorityUpdateOperatorRewriter rewriter(mir_context_, upesae); + rewriter.rewrite(udf); + if (expr_stmt->stmt_label != "") { + label_scope_.unscope(); + } + return; + } + if (expr_stmt->stmt_label != "") { + label_scope_.unscope(); + } + mir::MIRRewriter::visit(expr_stmt); + return; +} +void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::visit(mir::PriorityUpdateOperator::Ptr call) { + if (mir::isa(call->args[0])) { + insertVar(mir::to(call->args[0])->var); + } +} +void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::visit(mir::PriorityUpdateOperatorMin::Ptr call) { + mir::PriorityUpdateOperator::Ptr puo = call; + visit(puo); +} +void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::visit(mir::PriorityUpdateOperatorSum::Ptr call) { + mir::PriorityUpdateOperator::Ptr puo = call; + visit(puo); +} +void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::visit(mir::Call::Ptr call) { + if (call->name == "updatePriorityMin" || call->name == "UpdatePrioritySum") { + if (mir::isa(call->args[0])) { + insertVar(mir::to(call->args[0])->var); + } + } +} +void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::insertVar(mir::Var to_insert) { + for (auto var: priority_queues_used) { + if (var.getName() == to_insert.getName()) + return; + } + priority_queues_used.push_back(to_insert); +} +mir::Var GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::getPriorityQueue(void) { + assert(priority_queues_used.size() == 1 && "Exactly one priority queue must be used in the UDF supplied to UpdatePriorityEdgeSetApplyExpr"); + return priority_queues_used[0]; +} +void GPUPriorityFeaturesLowering::PriorityUpdateOperatorRewriter::visit(mir::Call::Ptr call) { + if (call->name == "updatePriorityMin") { + mir::PriorityUpdateOperatorMin::Ptr update_op = std::make_shared(); + update_op->priority_queue = call->args[0]; + update_op->destination_node_id = call->args[1]; + update_op->old_val = call->args[2]; + update_op->new_val = call->args[3]; + update_op->edgeset_apply_expr = puesae_; + node = update_op; + } else if (call->name == "updatePrioritySum") 
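// A minimal sketch of the rewrite, using a delta-stepping SSSP style UDF as the
// running example (GraphIt-level pseudocode, assumed rather than verbatim):
//
//   func updateEdge(src : Vertex, dst : Vertex, weight : int)
//       var new_dist : int = dist[src] + weight;
//       updatePriorityMin(pq, dst, dist[dst], new_dist);
//   end
//
// The generic Call to updatePriorityMin above is replaced by a typed
// PriorityUpdateOperatorMin node with priority_queue = args[0],
// destination_node_id = args[1], old_val = args[2] and new_val = args[3];
// updatePrioritySum is handled symmetrically just below with delta and
// minimum_val instead. Both operators keep a back pointer to the enclosing
// UpdatePriorityEdgeSetApplyExpr so code generation can pick the matching
// frontier-creation and load-balancing strategy.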
{ + mir::PriorityUpdateOperatorSum::Ptr update_op = std::make_shared(); + update_op->priority_queue = call->args[0]; + update_op->destination_node_id = call->args[1]; + update_op->delta = call->args[2]; + update_op->minimum_val = call->args[3]; + update_op->edgeset_apply_expr = puesae_; + node = update_op; + } +} +} diff --git a/src/midend/gpu_vector_field_properties_analyzer.cpp b/src/midend/gpu_vector_field_properties_analyzer.cpp new file mode 100644 index 00000000..2fa568f5 --- /dev/null +++ b/src/midend/gpu_vector_field_properties_analyzer.cpp @@ -0,0 +1,164 @@ +#include + +namespace graphit { +void GPUVectorFieldPropertiesAnalyzer::analyze(void) { + ApplyExprVisitor visitor(mir_context_); + for (auto func: mir_context_->getFunctionList()) { + func->accept(&visitor); + } +} +void GPUVectorFieldPropertiesAnalyzer::ApplyExprVisitor::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { + // Push apply expression requires synchronization on src when using non vertex based load balance + // Push apply expression always requires synchronization on dst + std::unordered_set idp_set; + mir::FuncDecl::Ptr func = mir_context_->getFunction(pesae->input_function->function_name->name); + + std::string src_name = func->args[0].getName(); + std::string dst_name = func->args[1].getName(); + + switch (pesae->applied_schedule.load_balancing) { + case fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::VERTEX_BASED: + idp_set.insert(src_name); + break; + default: + break; + } + + + PropertyAnalyzingVisitor visitor(mir_context_, idp_set, func); + func->accept(&visitor); +} +void GPUVectorFieldPropertiesAnalyzer::ApplyExprVisitor::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { + // Pull apply expression requires synchronization on dst when using non vertex based load balance + // Pull apply expression always requires synchronization on src + std::unordered_set idp_set; + mir::FuncDecl::Ptr func = mir_context_->getFunction(pesae->input_function->function_name->name); + + std::string src_name = func->args[0].getName(); + std::string dst_name = func->args[1].getName(); + + switch (pesae->applied_schedule.load_balancing) { + case fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::VERTEX_BASED: + idp_set.insert(dst_name); + break; + default: + break; + } + + + PropertyAnalyzingVisitor visitor(mir_context_, idp_set, func); + func->accept(&visitor); +} + +void GPUVectorFieldPropertiesAnalyzer::ApplyExprVisitor::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr pesae) { + // UpdatePriority will function just like Push for now + std::unordered_set idp_set; + mir::FuncDecl::Ptr func = mir_context_->getFunction(pesae->input_function->function_name->name); + + std::string src_name = func->args[0].getName(); + std::string dst_name = func->args[1].getName(); + + switch (pesae->applied_schedule.load_balancing) { + case fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::VERTEX_BASED: + idp_set.insert(src_name); + break; + default: + break; + } + + + PropertyAnalyzingVisitor visitor(mir_context_, idp_set, func); + func->accept(&visitor); + +} + + + +bool GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::is_independent_index(mir::Expr::Ptr expr) { + if (mir::isa(expr)) { + mir::VarExpr::Ptr var_expr = mir::to(expr); + if (independent_variables.count(var_expr->var.getName()) > 0) { + return true; + } + } + if (mir::isa(expr)) { + mir::AddExpr::Ptr add_expr = mir::to(expr); + if (mir::isa(add_expr->lhs) && is_independent_index(add_expr->rhs)) + return true; + if (mir::isa(add_expr->rhs) && 
is_independent_index(add_expr->lhs)) + return true; + } + if (mir::isa(expr)) { + mir::MulExpr::Ptr mul_expr = mir::to(expr); + if (mir::isa(mul_expr->lhs) && is_independent_index(mul_expr->rhs) && mir::to(mul_expr->lhs)->val != 0) + return true; + if (mir::isa(mul_expr->rhs) && is_independent_index(mul_expr->lhs) && mir::to(mul_expr->rhs)->val != 0) + return true; + } + + return false; + +} +void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::TensorReadExpr::Ptr tre) { + + tre->index->accept(this); + + FieldVectorProperty property; + property.read_write_type = FieldVectorProperty::ReadWriteType::READ_ONLY; + if (is_independent_index(tre->index)) { + property.access_type_ = FieldVectorProperty::AccessType::LOCAL; + } else { + property.access_type_ = FieldVectorProperty::AccessType::SHARED; + } + tre->field_vector_prop_ = property; + std::string target = tre->getTargetNameStr(); + enclosing_function->field_vector_properties_map_[target] = property; +} +void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::AssignStmt::Ptr assign_stmt) { + + assign_stmt->expr->accept(this); + + if (!mir::isa(assign_stmt->lhs)) + return; + + + mir::TensorReadExpr::Ptr tre = mir::to(assign_stmt->lhs); + tre->index->accept(this); + FieldVectorProperty property; + property.read_write_type = FieldVectorProperty::ReadWriteType::WRITE_ONLY; + if (is_independent_index(tre->index)) { + property.access_type_ = FieldVectorProperty::AccessType::LOCAL; + } else { + property.access_type_ = FieldVectorProperty::AccessType::SHARED; + } + tre->field_vector_prop_ = property; + std::string target = tre->getTargetNameStr(); + enclosing_function->field_vector_properties_map_[target] = property; +} +void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::ReduceStmt::Ptr reduce_stmt) { + reduce_stmt->expr->accept(this); + + if (!mir::isa(reduce_stmt->lhs)) + return; + mir::TensorReadExpr::Ptr tre = mir::to(reduce_stmt->lhs); + tre->index->accept(this); + FieldVectorProperty property; + property.read_write_type = FieldVectorProperty::ReadWriteType::READ_AND_WRITE; + if (is_independent_index(tre->index)) { + property.access_type_ = FieldVectorProperty::AccessType::LOCAL; + } else { + property.access_type_ = FieldVectorProperty::AccessType::SHARED; + } + tre->field_vector_prop_ = property; + std::string target = tre->getTargetNameStr(); + enclosing_function->field_vector_properties_map_[target] = property; + +} +void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::PriorityUpdateOperatorMin::Ptr puo) { + mir::MIRVisitor::visit(puo); + mir::Expr::Ptr index_expr = puo->destination_node_id; + if (!is_independent_index(index_expr)) { + puo->is_atomic = true; + } +} +} diff --git a/src/midend/mir.cpp b/src/midend/mir.cpp index 3fa45c78..eb05f2b6 100644 --- a/src/midend/mir.cpp +++ b/src/midend/mir.cpp @@ -94,6 +94,7 @@ namespace graphit { Call::copy(node); destination_node_id = expr->destination_node_id; priority_queue = expr->priority_queue; + edgeset_apply_expr = expr->edgeset_apply_expr; } @@ -258,6 +259,8 @@ namespace graphit { enable_deduplication = expr->enable_deduplication; is_weighted = expr->is_weighted; scope_label_name = expr->scope_label_name; + frontier_reusable = expr->frontier_reusable; + } @@ -1051,5 +1054,39 @@ namespace graphit { return node; } + void VertexSetDedupExpr::copy(MIRNode::Ptr node) { + const auto op = mir::to(node); + target = op->target; + } + MIRNode::Ptr VertexSetDedupExpr::cloneNode() { + const auto node = 
std::make_shared(); + node->copy(shared_from_this()); + return node; + } + + + void HybridGPUStmt::copy(MIRNode::Ptr node) { + const auto op = mir::to(node); + stmt1 = op->stmt1; + stmt2 = op->stmt2; + } + MIRNode::Ptr HybridGPUStmt::cloneNode() { + const auto node = std::make_shared(); + node->copy(shared_from_this()); + return node; + } + + void EnqueueVertex::copy(MIRNode::Ptr node) { + const auto op = mir::to(node); + vertex_id = op->vertex_id; + vertex_frontier = op->vertex_frontier; + type = op->type; + } + MIRNode::Ptr EnqueueVertex::cloneNode() { + const auto node = std::make_shared(); + node->copy(shared_from_this()); + return node; + } + } } diff --git a/src/midend/mir_emitter.cpp b/src/midend/mir_emitter.cpp index ed3fa342..ac9f33e8 100644 --- a/src/midend/mir_emitter.cpp +++ b/src/midend/mir_emitter.cpp @@ -963,7 +963,10 @@ namespace graphit { } ctx->addEdgeSet(mir_var_decl); ctx->addEdgesetType(mir_var_decl->name, type); - + } else if (std::dynamic_pointer_cast(mir_var_decl->type) != nullptr) { + ctx->const_priority_queues_.push_back(mir_var_decl); + mir_var_decl->modifier = "const"; + ctx->addConstant(mir_var_decl); } else { mir_var_decl->modifier = "const"; ctx->addConstant(mir_var_decl); diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index ed7fce16..f7a85888 100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -9,12 +9,17 @@ #include #include #include +#include #include +#include #include #include #include #include #include +#include +#include +#include namespace graphit { /** @@ -29,15 +34,23 @@ namespace graphit { UDFReuseFinder(mir_context).lower(); //lower global vector assignment to vector operations - GlobalFieldVectorLower(mir_context).lower(); + GlobalFieldVectorLower(mir_context, schedule).lower(); //lower global edgeset assignment (from loading) // needed for reading commandline arguments in the main function VertexEdgeSetLower(mir_context).lower(); - //This pass needs to happen before ApplyExprLower pass because the default ReduceBeforeUpdate uses ApplyExprLower - PriorityFeaturesLower(mir_context, schedule).lower(); + // We use the GPU version when the GPU Scheules are set + if (schedule != nullptr && !schedule->apply_gpu_schedules.empty()) { + GPUPriorityFeaturesLowering(mir_context, schedule).lower(); + } else { + //This pass needs to happen before ApplyExprLower pass because the default ReduceBeforeUpdate uses ApplyExprLower + PriorityFeaturesLower(mir_context, schedule).lower(); + } + + // This pass finds EdgeSetApplyExpressions that allow frontiers to be reused and removes the corresponding deletes + FrontierReuseAnalysis(mir_context).analyze(); // This pass sets properties of edgeset apply expressions based on the schedules including // edge traversal direction: push, pull, denseforward, hybrid_dense, hybrid_denseforward @@ -59,7 +72,11 @@ namespace graphit { // Use program analysis to figure out the properties of each tensor access // read write type: read/write/read and write (reduction) // access type: shared or local - VectorFieldPropertiesAnalyzer(mir_context,schedule).analyze(); + if (schedule != nullptr && !schedule->apply_gpu_schedules.empty()) { + GPUVectorFieldPropertiesAnalyzer(mir_context, schedule).analyze(); + } else { + VectorFieldPropertiesAnalyzer(mir_context, schedule).analyze(); + } // The pass on lowering abstract data structures to // concrete data structures with physical layout information (arrays, field of a struct, dictionary) @@ -67,16 +84,25 @@ namespace graphit { // This pass inserts atomic 
operations, including CAS, writeMin, writeAdd // This pass does not need the schedule - AtomicsOpLower(mir_context).lower(); + AtomicsOpLower(mir_context, schedule).lower(); // This pass generates code for tracking if a field has been modified // during the execution of the edgeset apply functions. // It return values for implicit tracking of changes to certain field - ChangeTrackingLower(mir_context, schedule).lower(); + if (schedule != nullptr && !schedule->apply_gpu_schedules.empty()) { + // No change tracking lower for GPUs + GPUChangeTrackingLower(mir_context, schedule).lower(); + } else { + ChangeTrackingLower(mir_context, schedule).lower(); + } // This pass extracts the merge field and reduce operator. If numa_aware is set to true in // the schedule for the corresponding label, it also adds NUMA optimization MergeReduceLower(mir_context, schedule).lower(); + + // This pass lowers while loops that have fusion schedule attached to them + WhileLoopFusion(mir_context, schedule).lower(); + } } diff --git a/src/midend/mir_rewriter.cpp b/src/midend/mir_rewriter.cpp index 6b9fd9c7..d7b58e08 100644 --- a/src/midend/mir_rewriter.cpp +++ b/src/midend/mir_rewriter.cpp @@ -460,6 +460,22 @@ namespace graphit { ptr->target = rewrite(ptr->target); node = ptr; } + + void MIRRewriter::visit(VertexSetDedupExpr::Ptr ptr) { + ptr->target = rewrite(ptr->target); + node = ptr; + } + + void MIRRewriter::visit(HybridGPUStmt::Ptr stmt) { + stmt->stmt1 = rewrite(stmt->stmt1); + stmt->stmt2 = rewrite(stmt->stmt2); + node = stmt; + } + void MIRRewriter::visit(EnqueueVertex::Ptr stmt) { + stmt->vertex_id = rewrite(stmt->vertex_id); + stmt->vertex_frontier = rewrite(stmt->vertex_frontier); + node = stmt; + } } } diff --git a/src/midend/mir_visitor.cpp b/src/midend/mir_visitor.cpp index adfec24d..4526a803 100644 --- a/src/midend/mir_visitor.cpp +++ b/src/midend/mir_visitor.cpp @@ -424,6 +424,17 @@ namespace graphit { void MIRVisitor::visit(std::shared_ptr op) { visit(std::static_pointer_cast(op)); } - + + void MIRVisitor::visit(std::shared_ptr op) { + op->target->accept(this); + } + void MIRVisitor::visit(std::shared_ptr op) { + op->stmt1->accept(this); + op->stmt2->accept(this); + } + void MIRVisitor::visit(std::shared_ptr op) { + op->vertex_id->accept(this); + op->vertex_frontier->accept(this); + } } } diff --git a/src/midend/priority_features_lowering.cpp b/src/midend/priority_features_lowering.cpp index ef0ff412..5bf6f483 100644 --- a/src/midend/priority_features_lowering.cpp +++ b/src/midend/priority_features_lowering.cpp @@ -49,10 +49,13 @@ namespace graphit { function->accept(&lower_update_priority_edge_set_apply_expr); } - // Detect pattern for OrderedProcessingOperator, and lower into the MIR node for OrderedProcessingOp - auto lower_ordered_processing_op = LowerIntoOrderedProcessingOperatorRewriter(schedule_, mir_context_); - for (auto function : functions) { - lower_ordered_processing_op.rewrite(function); + if (mir_context_->priority_update_type == mir::PriorityUpdateType::EagerPriorityUpdateWithMerge || + mir_context_->priority_update_type == mir::PriorityUpdateType::EagerPriorityUpdate){ + // Detect pattern for OrderedProcessingOperator, and lower into the MIR node for OrderedProcessingOp + auto lower_ordered_processing_op = LowerIntoOrderedProcessingOperatorRewriter(schedule_, mir_context_); + for (auto function : functions) { + lower_ordered_processing_op.rewrite(function); + } } // Lowers into PriorityUpdateOperators (PriorityUpdateMin and PriorityUpdateSum) diff --git 
a/src/midend/vector_op_lower.cpp b/src/midend/vector_op_lower.cpp index fd8110d7..20f8e3b4 100644 --- a/src/midend/vector_op_lower.cpp +++ b/src/midend/vector_op_lower.cpp @@ -28,110 +28,114 @@ namespace graphit { // do the lowering if the right handside is a call stmt (may be add if the right hand side is part of a struct) if (mir::isa(var_decl->initVal)){ - auto orig_init_val = var_decl->initVal; - mir::VectorType::Ptr vector_type = std::dynamic_pointer_cast(var_decl->type); - - if (mir::isa(vector_type->vector_element_type)){ - mir::ScalarType::Ptr element_type = mir::to( - vector_type->vector_element_type); - //reset the initval to something default 0 for integer and float - if (element_type->type == mir::ScalarType::Type::INT){ - //initial value should be a int - auto zero = std::make_shared(); - zero->val = 0; - var_decl->initVal = zero; - } - else if (element_type->type == mir::ScalarType::Type::FLOAT){ - //initial value should be a float - - } - - //insert another const var decl as the temporary holder for the function - //this vector is always going to be assigned a value (pointer) returned from the function call and hence does not need allocation - auto tmp_var_decl = std::make_shared(); - tmp_var_decl->needs_allocation = false; - tmp_var_decl->type = var_decl->type; - tmp_var_decl->initVal = orig_init_val; - tmp_var_decl->name = "generated_tmp_vector_" + mir_context_->getUniqueNameCounterString(); - tmp_var_decl->modifier = var_decl->modifier; - mir_context_->insertNewConstVectorDeclEnd(tmp_var_decl); - - //create a new apply function decl that copies over the vector - if (mir_context_->isVertexElementType(vector_type->element_type->ident)){ - //a vertexset apply function if the element is a vertexset - mir::FuncDecl::Ptr copy_over_apply_func = std::make_shared(); - // create a utility function for creating new vertexset apply - // set up a name - copy_over_apply_func->name = "generated_vector_op_apply_func_" - + mir_context_->getUniqueNameCounterString(); - auto arg_var_type = vector_type->element_type; - mir::Var arg_var = mir::Var("v", arg_var_type); - std::vector arg_var_list = std::vector(); - arg_var_list.push_back(arg_var); - copy_over_apply_func->args = arg_var_list; - - auto mir_stmt_body = std::make_shared(); - auto assign_stmt = std::make_shared(); - - auto lhs = std::make_shared( - var_decl->name, "v", - var_decl->type, - vector_type->element_type - ); - - auto rhs = std::make_shared( - tmp_var_decl->name, "v", - tmp_var_decl->type, - vector_type->element_type - ); - - assign_stmt->lhs = lhs; - assign_stmt->expr = rhs; - mir_stmt_body->insertStmtEnd(assign_stmt); - copy_over_apply_func->body = mir_stmt_body; - //insert the utility function back into function list - mir_context_->insertFuncDeclFront(copy_over_apply_func); - - - //make funcExpr to pass in to VertexSetApplyExpr - mir::FuncExpr::Ptr funcExprApply = std::make_shared(); - mir::IdentDecl::Ptr funcExprApplyIdentifier = std::make_shared(); - funcExprApplyIdentifier->name = copy_over_apply_func->name; - funcExprApply->function_name = funcExprApplyIdentifier; - - - // Lastly, insert a vertexset apply expression at the beginning of main - mir::VarDecl::Ptr global_vertex_set_var_decl = mir_context_->getGlobalConstVertexSet(); - mir::VertexSetApplyExpr::Ptr vertex_set_apply_expr = - std::make_shared(global_vertex_set_var_decl->name, - global_vertex_set_var_decl->type, - funcExprApply); - mir::ExprStmt::Ptr apply_stmt = std::make_shared(); - apply_stmt->expr = vertex_set_apply_expr; - - //No longer directly add to 
the main function block - //mir::FuncDecl::Ptr main_func_decl = mir_context_->getMainFuncDecl(); - //main_func_decl->body->insertStmtFront(apply_stmt); - - - //puts the current vector initilization block into the back of the initialization block - //which would be later put into main function declaration - //vector_initialization_block->insertStmtEnd(apply_stmt); - - - auto tmp_var_assign_stmt = std::make_shared(); - auto tmp_var_expr = std::make_shared(); - tmp_var_expr->var = mir::Var(tmp_var_decl->name, tmp_var_decl->type); - tmp_var_assign_stmt->lhs = tmp_var_expr; - tmp_var_assign_stmt->expr = tmp_var_decl->initVal; - - mir_context_->field_vector_init_stmts.push_back(tmp_var_assign_stmt); - mir_context_->field_vector_init_stmts.push_back(apply_stmt); - - - } - } - + // Special case if this is GPU lowering + + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + // Do nothing for GPU + } else { + auto orig_init_val = var_decl->initVal; + mir::VectorType::Ptr vector_type = std::dynamic_pointer_cast(var_decl->type); + + if (mir::isa(vector_type->vector_element_type)){ + mir::ScalarType::Ptr element_type = mir::to( + vector_type->vector_element_type); + //reset the initval to something default 0 for integer and float + if (element_type->type == mir::ScalarType::Type::INT){ + //initial value should be a int + auto zero = std::make_shared(); + zero->val = 0; + var_decl->initVal = zero; + } + else if (element_type->type == mir::ScalarType::Type::FLOAT){ + //initial value should be a float + + } + + //insert another const var decl as the temporary holder for the function + //this vector is always going to be assigned a value (pointer) returned from the function call and hence does not need allocation + auto tmp_var_decl = std::make_shared(); + tmp_var_decl->needs_allocation = false; + tmp_var_decl->type = var_decl->type; + tmp_var_decl->initVal = orig_init_val; + tmp_var_decl->name = "generated_tmp_vector_" + mir_context_->getUniqueNameCounterString(); + tmp_var_decl->modifier = var_decl->modifier; + mir_context_->insertNewConstVectorDeclEnd(tmp_var_decl); + + //create a new apply function decl that copies over the vector + if (mir_context_->isVertexElementType(vector_type->element_type->ident)){ + //a vertexset apply function if the element is a vertexset + mir::FuncDecl::Ptr copy_over_apply_func = std::make_shared(); + // create a utility function for creating new vertexset apply + // set up a name + copy_over_apply_func->name = "generated_vector_op_apply_func_" + + mir_context_->getUniqueNameCounterString(); + auto arg_var_type = vector_type->element_type; + mir::Var arg_var = mir::Var("v", arg_var_type); + std::vector arg_var_list = std::vector(); + arg_var_list.push_back(arg_var); + copy_over_apply_func->args = arg_var_list; + + auto mir_stmt_body = std::make_shared(); + auto assign_stmt = std::make_shared(); + + auto lhs = std::make_shared( + var_decl->name, "v", + var_decl->type, + vector_type->element_type + ); + + auto rhs = std::make_shared( + tmp_var_decl->name, "v", + tmp_var_decl->type, + vector_type->element_type + ); + + assign_stmt->lhs = lhs; + assign_stmt->expr = rhs; + mir_stmt_body->insertStmtEnd(assign_stmt); + copy_over_apply_func->body = mir_stmt_body; + //insert the utility function back into function list + mir_context_->insertFuncDeclFront(copy_over_apply_func); + + //make funcExpr to pass in to VertexSetApplyExpr + mir::FuncExpr::Ptr funcExprApply = std::make_shared(); + mir::IdentDecl::Ptr funcExprApplyIdentifier = std::make_shared(); + 
funcExprApplyIdentifier->name = copy_over_apply_func->name; + funcExprApply->function_name = funcExprApplyIdentifier; + + + // Lastly, insert a vertexset apply expression at the beginning of main + mir::VarDecl::Ptr global_vertex_set_var_decl = mir_context_->getGlobalConstVertexSet(); + mir::VertexSetApplyExpr::Ptr vertex_set_apply_expr = + std::make_shared(global_vertex_set_var_decl->name, + global_vertex_set_var_decl->type, + funcExprApply); + mir::ExprStmt::Ptr apply_stmt = std::make_shared(); + apply_stmt->expr = vertex_set_apply_expr; + + //No longer directly add to the main function block + //mir::FuncDecl::Ptr main_func_decl = mir_context_->getMainFuncDecl(); + //main_func_decl->body->insertStmtFront(apply_stmt); + + + //puts the current vector initilization block into the back of the initialization block + //which would be later put into main function declaration + //vector_initialization_block->insertStmtEnd(apply_stmt); + + + auto tmp_var_assign_stmt = std::make_shared(); + auto tmp_var_expr = std::make_shared(); + tmp_var_expr->var = mir::Var(tmp_var_decl->name, tmp_var_decl->type); + tmp_var_assign_stmt->lhs = tmp_var_expr; + tmp_var_assign_stmt->expr = tmp_var_decl->initVal; + + mir_context_->field_vector_init_stmts.push_back(tmp_var_assign_stmt); + mir_context_->field_vector_init_stmts.push_back(apply_stmt); + + + } + } + } } else { //field vector property and the initialization is not through a call stmt //create a new apply function decl that initializes every value diff --git a/src/midend/while_loop_fusion.cpp b/src/midend/while_loop_fusion.cpp new file mode 100644 index 00000000..6304f170 --- /dev/null +++ b/src/midend/while_loop_fusion.cpp @@ -0,0 +1,35 @@ +#include + +void graphit::WhileLoopFusion::lower(void) { + std::vector functions = mir_context_->getFunctionList(); + for (auto function : functions) { + function->accept(this); + } +} +void graphit::WhileLoopFusion::visit(mir::WhileStmt::Ptr while_stmt) { + if (while_stmt->stmt_label != "") { + label_scope_.scope(while_stmt->stmt_label); + } + while_stmt->cond->accept(this); + while_stmt->body->accept(this); + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule)) { + auto applied_simple_schedule = dynamic_cast(apply_schedule); + if (applied_simple_schedule->kernel_fusion == fir::gpu_schedule::SimpleGPUSchedule::kernel_fusion_type::FUSION_ENABLED) { + while_stmt->is_fused = true; + mir_context_->fused_while_loops.push_back(while_stmt); + } + } + } + } + + if (while_stmt->stmt_label != "") { + label_scope_.unscope(); + } + +} + diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h new file mode 100644 index 00000000..a1a54b24 --- /dev/null +++ b/src/runtime_lib/gpu_intrinsics.h @@ -0,0 +1,43 @@ +#ifndef GPU_INTRINSICS_H +#define GPU_INTRINSICS_H + +#include +#include + +#include "infra_gpu/graph.h" +#include "infra_gpu/vertex_frontier.h" +#include "infra_gpu/vertex_representation.h" +#include "infra_gpu/load_balance.h" +#include "graphit_timer.h" +#include "infra_gpu/support.h" +#include "infra_gpu/printer.h" +#include "infra_gpu/gpu_priority_queue.h" +#include "infra_gpu/list.h" + +namespace gpu_runtime { + +void deleteObject(VertexFrontier &t) { + delete_vertex_frontier(t); 
+} + +template +void deleteObject(GPUPriorityQueue &pq) { + pq.release(); +} + + void * no_args[1]; + +float str_to_float(const char* str) { + float val; + if (sscanf(str, "%f", &val) != 1) + return 0.0; + return val; +} +int32_t str_to_int(const char* str) { + int32_t val; + if (sscanf(str, "%i", &val) != 1) + return 0; + return val; +} +} +#endif diff --git a/src/runtime_lib/graphit_timer.h b/src/runtime_lib/graphit_timer.h new file mode 100644 index 00000000..1eed0c79 --- /dev/null +++ b/src/runtime_lib/graphit_timer.h @@ -0,0 +1,19 @@ +#ifndef GRAPHIT_TIMER_H +#define GRAPHIT_TIMER_H +#include + +static struct timeval start_time_; +static struct timeval elapsed_time_; + +static void startTimer(){ + gettimeofday(&start_time_, NULL); +} + +static float stopTimer(){ + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; + +} +#endif diff --git a/src/runtime_lib/infra_gapbs/graph.h b/src/runtime_lib/infra_gapbs/graph.h index 7d77dbd8..0d7f9649 100644 --- a/src/runtime_lib/infra_gapbs/graph.h +++ b/src/runtime_lib/infra_gapbs/graph.h @@ -143,8 +143,10 @@ class CSRGraph { public: +#ifndef IGNORE_JULIENNE_TYPES julienne::graph julienne_graph = __julienne_null_graph; //julienne::EdgeMap *em; +#endif CSRGraph() : directed_(false), num_nodes_(-1), num_edges_(-1), out_index_(nullptr), out_neighbors_(nullptr), in_index_(nullptr), in_neighbors_(nullptr), flags_(nullptr), is_transpose_(false) {} diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h new file mode 100755 index 00000000..e3f251de --- /dev/null +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -0,0 +1,291 @@ +#ifndef GPU_PRIORITY_QUEUE_H +#define GPU_PRIORITY_QUEUE_H + +#include +#include +#include "vertex_frontier.h" + +#ifndef NUM_BLOCKS +#define NUM_BLOCKS 80 +#endif + +#ifndef CTA_SIZE +#define CTA_SIZE 1024 +#endif + + +namespace gpu_runtime { + +template + class GPUPriorityQueue; + +static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices); +static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier); +static void __device__ update_nodes_identify_min_device(GPUPriorityQueue* gpq, int32_t num_vertices); +static void __device__ update_nodes_special_device(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier); + +template + class GPUPriorityQueue { + + public: + + size_t getCurrentPriority(){ + return current_priority_; + } + + void init(GraphT graph, PriorityT_ * host_priorities, PriorityT_* device_priorities, PriorityT_ initial_priority, PriorityT_ delta, NodeID initial_node = -1){ + host_priorities_ = host_priorities; + device_priorities_ = device_priorities; + current_priority_ = initial_priority; + delta_ = delta; + ready_set_dequeued = false; + frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + frontier_.d_priority_array = device_priorities; + frontier_.priority_cutoff = current_priority_ + delta_; + cudaMalloc(¤t_priority_shared, sizeof(PriorityT_)); + if (initial_node != -1){ + gpu_runtime::builtin_addVertex(frontier_, initial_node); + } + } + void release(void) { + delete_vertex_frontier(frontier_); + } + + void __device__ updatePriorityMin(GPUPriorityQueue * device_gpq, PriorityT_ new_priority, VertexFrontier output_frontier, int32_t node){ + bool 
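// writeMin below is the runtime's atomicMin-style helper and is expected to
// return true only when this thread's new_priority actually lowered the stored
// value, so the bytemap enqueue fires at most once per successful relaxation.
// The early return skips the enqueue when the updated priority still lies
// outside the current bucket [current_priority_, current_priority_ + delta_);
// such a vertex is picked up later by update_nodes_special once its bucket is
// opened by dequeueReadySet.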
output = gpu_runtime::writeMin(&(device_gpq->device_priorities_[node]), new_priority); + if (device_gpq->device_priorities_[node] >= (device_gpq->current_priority_ + device_gpq->delta_)) return; + if (output){ + enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, node); + } + + } + + bool finished(GPUPriorityQueue * device_gpq) { + if (current_priority_ == INT_MAX){ + return true; + } + + if (!ready_set_dequeued && gpu_runtime::builtin_getVertexSetSize(frontier_) == 0){ + dequeueReadySet(device_gpq); + ready_set_dequeued = true; + return current_priority_ == INT_MAX; + } + + return false; + } +#ifdef GLOBAL + bool __device__ device_finished(void) { + if (current_priority_ == INT_MAX) + return true; + if (!ready_set_dequeued && gpu_runtime::device_builtin_getVertexSetSize(frontier_) == 0) { + device_dequeueReadySet(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + ready_set_dequeued = true; + this_grid().sync(); + return current_priority_ == INT_MAX; + } + return false; + } +#endif + bool __device__ device_finished(void) { + if(current_priority_ == INT_MAX) + return true; + if (!ready_set_dequeued && gpu_runtime::device_builtin_getVertexSetSize(frontier_) == 0) { + device_dequeueReadySet(); + ready_set_dequeued = true; + return current_priority_ == INT_MAX; + } + return false; + } + + bool host_finishedNode(NodeID v){ + return host_priorities_[v]/delta_ < current_priority_; + } + + bool __device__ device_finishedNode(NodeID v){ + + } + + VertexFrontier& dequeueReadySet(GPUPriorityQueue * device_gpq){ + // if this is already dequeued in the previous finish() operator + // then don't do the dequeu operation again + if (ready_set_dequeued){ + //Now that we dequeued it, the next ready set is no longer dequeued + ready_set_dequeued = false; + return frontier_; + } + + //perform the dequeue operation only if the current frontier is empty + if (gpu_runtime::builtin_getVertexSetSize(frontier_) == 0) { + window_upper_ = current_priority_ + delta_; + current_priority_ = INT_MAX; + + cudaMemcpy(current_priority_shared, ¤t_priority_, sizeof(int32_t), cudaMemcpyHostToDevice); + cudaMemcpy(device_gpq, this, sizeof(*device_gpq), cudaMemcpyHostToDevice); + gpu_runtime::cudaCheckLastError(); + update_nodes_identify_min<<>>(device_gpq, frontier_.max_num_elems); + gpu_runtime::cudaCheckLastError(); + + cudaMemcpy(&(device_gpq->current_priority_), current_priority_shared, sizeof(int32_t), cudaMemcpyDeviceToHost); + + cudaMemcpy(this, device_gpq, sizeof(*this), cudaMemcpyDeviceToHost); + gpu_runtime::cudaCheckLastError(); + + update_nodes_special<<>>(device_gpq, frontier_.max_num_elems, frontier_); + gpu_runtime::cudaCheckLastError(); + gpu_runtime::swap_queues(frontier_); + frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + + //Now that we dequeued it, the next ready set is no longer dequeued + frontier_.priority_cutoff = current_priority_ + delta_; + ready_set_dequeued = false; + return frontier_; + } + + //if it is empty, just return the empty frontier + return frontier_; + } + + VertexFrontier __device__ device_dequeueReadySet(void) { + if (ready_set_dequeued) { + ready_set_dequeued = false; + return frontier_; + } + if (gpu_runtime::device_builtin_getVertexSetSize(frontier_) == 0) { + window_upper_ = current_priority_ + delta_; + current_priority_ = INT_MAX; + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + current_priority_shared[0] = INT_MAX; + } + this_grid().sync(); + + update_nodes_identify_min_device(this, 
frontier_.max_num_elems); + this_grid().sync(); + + current_priority_ = current_priority_shared[0]; + this_grid().sync(); + update_nodes_special_device(this, frontier_.max_num_elems, frontier_); + gpu_runtime::swap_queues_device(frontier_); + frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + ready_set_dequeued = false; + frontier_.priority_cutoff = current_priority_ + delta_; + return frontier_; + } + return frontier_; + } + + +#ifdef GLOBAL + VertexFrontier __device__ device_dequeueReadySet(void) { +/* + if (threadIdx.x + blockDim.x * blockIdx.x == 0) + printf("Entering dequeue ready set\n"); +*/ + if (ready_set_dequeued) { + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + ready_set_dequeued = false; + this_grid().sync(); + return frontier_; + } + if (gpu_runtime::device_builtin_getVertexSetSize(frontier_) == 0) { +/* + if (threadIdx.x + blockDim.x * blockIdx.x == 0) + printf("Entering special case\n"); +*/ + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + window_upper_ = current_priority_ + delta_; + current_priority_ = INT_MAX; + } + this_grid().sync(); + // No need for copy + update_nodes_identify_min_device(this, frontier_.max_num_elems); + this_grid().sync(); + update_nodes_special_device(this, frontier_.max_num_elems, frontier_); + this_grid().sync(); + gpu_runtime::swap_queues_device_global(frontier_); + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + ready_set_dequeued = false; + } + this_grid().sync(); + return frontier_; + + } + this_grid().sync(); + return frontier_; + } +#endif + + PriorityT_* host_priorities_ = nullptr; + PriorityT_* device_priorities_ = nullptr; + + PriorityT_ delta_ = 1; + PriorityT_ current_priority_ = 0; + PriorityT_ window_upper_ = 0; + + //Need to do = {0} to avoid dynamic initialization error + VertexFrontier frontier_ = {0}; + bool ready_set_dequeued = false; + + PriorityT_ *current_priority_shared = nullptr; + }; + + +static void __device__ update_nodes_identify_min_device(GPUPriorityQueue* gpq, int32_t num_vertices) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + int32_t my_minimum = INT_MAX; + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < num_vertices) { + if (gpq->device_priorities_[node_id] >= (gpq->window_upper_) && gpq->device_priorities_[node_id] != INT_MAX && gpq->device_priorities_[node_id] < my_minimum) { + my_minimum = gpq->device_priorities_[node_id]; + } + } + } + + if (my_minimum < gpq->current_priority_shared[0]){ + atomicMin(&(gpq->current_priority_shared[0]), my_minimum); + } +}//end of update_nodes_identify_min + + + +static void __device__ update_nodes_special_device(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier){ + + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + //int warp_id = thread_id / 32; + + int total_work = num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < num_vertices) { + if(gpq->device_priorities_[node_id] >= gpq->current_priority_ && gpq->device_priorities_[node_id] < (gpq->current_priority_ + gpq->delta_)) { + 
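// A minimal sketch (assumed usage, not generated code) of how host code drives
// this queue for delta-stepping style algorithms:
//
//   gpu_runtime::GPUPriorityQueue<int32_t> gpq, *device_gpq;
//   cudaMalloc(&device_gpq, sizeof(gpq));
//   gpq.init(graph, host_dist, device_dist, 0 /* initial priority */, delta, source);
//   while (!gpq.finished(device_gpq)) {
//       gpu_runtime::VertexFrontier &ready = gpq.dequeueReadySet(device_gpq);
//       // launch the edgeset-apply kernel over `ready`; relaxations go through
//       // updatePriorityMin(), which enqueues improved vertices of this bucket
//   }
//
// update_nodes_identify_min scans all priorities for the smallest value at or
// above window_upper_, and update_nodes_special (this kernel) then gathers every
// vertex whose priority lies in [current_priority_, current_priority_ + delta_)
// into the new frontier.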
gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); + } + } + } +} + + +static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices) { + update_nodes_identify_min_device(gpq, num_vertices); +} + +static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier){ + update_nodes_special_device(gpq, num_vertices, output_frontier); +} + + +} + + +#endif // GPU_PRIORITY_QUEUE_H diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h new file mode 100755 index 00000000..9fd91285 --- /dev/null +++ b/src/runtime_lib/infra_gpu/graph.h @@ -0,0 +1,351 @@ +#ifndef GPU_GRAPH_H +#define GPU_GRAPH_H + +#include +#include "infra_gpu/support.h" + +// GraphT data structure +#define IGNORE_JULIENNE_TYPES +#include "infra_gapbs/benchmark.h" +#include "infra_gpu/vertex_frontier.h" +#include "graphit_timer.h" +#ifndef FRONTIER_MULTIPLIER + #define FRONTIER_MULTIPLIER (6) +#endif +namespace gpu_runtime { + +template +struct GraphT { // Field names are according to CSR, reuse for CSC + typedef EdgeWeightType EdgeWeightT; + int32_t num_vertices; + int32_t num_edges; + + // Host pointers + int32_t *h_src_offsets; // num_vertices + 1; + int32_t *h_edge_src; // num_edges; + int32_t *h_edge_dst; // num_edges; + EdgeWeightType *h_edge_weight; // num_edges; + + // Device pointers + int32_t *d_src_offsets; // num_vertices + 1; + int32_t *d_edge_src; // num_edges; + int32_t *d_edge_dst; // num_edges; + EdgeWeightType *d_edge_weight; // num_edges; + + GraphT *transposed_graph; + + + int32_t h_get_degree(int32_t vertex_id) { + return h_src_offsets[vertex_id + 1] - h_src_offsets[vertex_id]; + } + int32_t __device__ d_get_degree(int32_t vertex_id) { + return d_src_offsets[vertex_id + 1] - d_src_offsets[vertex_id]; + } + VertexFrontier full_frontier; + VertexFrontier& getFullFrontier(void) { + full_frontier.max_num_elems = num_vertices; + return full_frontier; + } + VertexFrontier& __device__ getFullFrontierDevice(void) { + full_frontier.max_num_elems = num_vertices; + return full_frontier; + } + + + // Load balance scratch pads + // TWC bins + int32_t *twc_small_bin; + int32_t *twc_mid_bin; + int32_t *twc_large_bin; + + int32_t *twc_bin_sizes; + + // strict frontiers + int32_t *strict_sum; + int32_t *strict_cta_sum; + int32_t *strict_grid_sum; + + + // blocking related parameters + int32_t num_buckets; + int32_t *h_bucket_sizes; + int32_t *d_bucket_sizes; + + +}; +void consume(int32_t _) { +} +#define CONSUME consume +template +void static sort_with_degree(GraphT &graph) { + assert(false && "Sort with degree not yet implemented\n"); + return; +} +static bool string_ends_with(const char* str, const char* sub_str) { + if (strlen(sub_str) > strlen(str)) + return false; + int32_t len1 = strlen(str); + int32_t len2 = strlen(sub_str); + if (strcmp(str + len1 - len2, sub_str) == 0) + return true; + return false; +} + +static int32_t identify_block_id (int32_t vid, int32_t blocking_size) { + return vid / blocking_size; +} +template +static void block_graph_edges(GraphT &input_graph, GraphT &output_graph, int32_t blocking_size) { + output_graph = input_graph; + output_graph.h_src_offsets = nullptr; + output_graph.d_src_offsets = nullptr; + + output_graph.h_edge_src = new int32_t[input_graph.num_edges]; + output_graph.h_edge_dst = new int32_t[input_graph.num_edges]; + output_graph.h_edge_weight = new EdgeWeightType[input_graph.num_edges]; + 
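// A small worked example of the blocking performed below (illustrative only):
// with blocking_size = 2 and the edge list
//   (0->1), (0->3), (1->0), (1->2)
// the destination blocks are {0,1} and {2,3}, and the stable counting sort
// reorders the edges as
//   (0->1), (1->0) | (0->3), (1->2)
// with h_bucket_sizes recording the block boundaries in the reordered list.
// Keeping edges with nearby destinations contiguous improves locality of the
// destination vertex data for load balancers that process one bucket at a time.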
+ int32_t num_blocks = (input_graph.num_vertices + blocking_size - 1)/blocking_size; + //std::cout << "num blocks " << num_blocks << std::endl; + int32_t *block_sizes = new int32_t[num_blocks+1]; + for (int32_t id = 0; id < num_blocks+1; id++) + block_sizes[id] = 0; + + for (int32_t eid = 0; eid < input_graph.num_edges; eid++) { + int32_t dst = input_graph.h_edge_dst[eid]; + int32_t block_id = identify_block_id(dst, blocking_size); + block_sizes[block_id+1] += 1; + } + int32_t running_sum = 0; + for (int32_t bid = 0; bid < num_blocks; bid++) { + running_sum += block_sizes[bid]; + block_sizes[bid] = running_sum; + } + block_sizes[0] = 0; + for (int32_t eid = 0; eid < input_graph.num_edges; eid++) { + int32_t dst = input_graph.h_edge_dst[eid]; + int32_t block_id = identify_block_id(dst, blocking_size); + int32_t new_eid = block_sizes[block_id]; + block_sizes[block_id]++; + output_graph.h_edge_src[new_eid] = input_graph.h_edge_src[eid]; + output_graph.h_edge_dst[new_eid] = input_graph.h_edge_dst[eid]; + output_graph.h_edge_weight[new_eid] = input_graph.h_edge_weight[eid]; + } + + //delete[] block_sizes; + output_graph.num_buckets = num_blocks; + output_graph.h_bucket_sizes = block_sizes; + + + + cudaFree(input_graph.d_edge_src); + cudaFree(input_graph.d_edge_dst); + cudaFree(input_graph.d_edge_weight); + + cudaMalloc(&output_graph.d_edge_src, sizeof(int32_t) * output_graph.num_edges); + cudaMalloc(&output_graph.d_edge_dst, sizeof(int32_t) * output_graph.num_edges); + cudaMalloc(&output_graph.d_edge_weight, sizeof(EdgeWeightType) * output_graph.num_edges); + cudaMalloc(&output_graph.d_bucket_sizes, sizeof(int32_t) * num_blocks); + + + cudaMemcpy(output_graph.d_edge_src, output_graph.h_edge_src, sizeof(int32_t) * output_graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_edge_dst, output_graph.h_edge_dst, sizeof(int32_t) * output_graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_edge_weight, output_graph.h_edge_weight, sizeof(EdgeWeightType) * output_graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_bucket_sizes, output_graph.h_bucket_sizes, sizeof(int32_t) * num_blocks, cudaMemcpyHostToDevice); + +} + +template +static GraphT builtin_transpose(GraphT &graph) { + if (graph.transposed_graph != nullptr) + return *(graph.transposed_graph); + // For now we will return the same graph + // TODO: copy transpose implementation from infra_ CPU + GraphT output_graph; + output_graph.num_vertices = graph.num_vertices; + output_graph.num_edges = graph.num_edges; + + output_graph.h_src_offsets = new int32_t[graph.num_vertices+2]; + output_graph.h_edge_src = new int32_t[graph.num_edges]; + output_graph.h_edge_dst = new int32_t[graph.num_edges]; + output_graph.h_edge_weight = new EdgeWeightType[graph.num_edges]; + + for (int32_t i = 0; i < graph.num_vertices + 2; i++) + output_graph.h_src_offsets[i] = 0; + + // This will count the degree for each vertex in the transposed graph + for (int32_t i = 0; i < graph.num_edges; i++) { + int32_t dst = graph.h_edge_dst[i]; + output_graph.h_src_offsets[dst+2]++; + } + + // We will now create cummulative sums + for (int32_t i = 0; i < graph.num_vertices; i++) { + output_graph.h_src_offsets[i+2] += output_graph.h_src_offsets[i+1]; + } + + // Finally fill in the edges and the weights for the new graph + for (int32_t i = 0; i < graph.num_edges; i++) { + int32_t dst = graph.h_edge_dst[i]; + int32_t pos = output_graph.h_src_offsets[dst+1]; + output_graph.h_src_offsets[dst+1]++; + output_graph.h_edge_src[pos] = dst; + 
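// Worked example of the counting-sort transpose in this loop (3 vertices,
// edges (0->1), (0->2), (1->2)): the in-degree pass yields h_src_offsets =
// [0,0,0,1,2], the prefix pass turns it into [0,0,0,1,3], and this scatter then
// places each reversed edge at position h_src_offsets[dst+1]++, giving a
// transposed CSR whose row 1 is {0} and row 2 is {0,1}. Sizing the offsets
// array at num_vertices + 2 is what allows the "+1" slots to be bumped during
// the fill without a second prefix sum.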
output_graph.h_edge_dst[pos] = graph.h_edge_src[i]; + output_graph.h_edge_weight[pos] = graph.h_edge_weight[i]; + } + + cudaMalloc(&output_graph.d_edge_src, sizeof(int32_t) * graph.num_edges); + cudaMalloc(&output_graph.d_edge_dst, sizeof(int32_t) * graph.num_edges); + cudaMalloc(&output_graph.d_edge_weight, sizeof(EdgeWeightType) * graph.num_edges); + cudaMalloc(&output_graph.d_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1)); + + + cudaMemcpy(output_graph.d_edge_src, output_graph.h_edge_src, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_edge_dst, output_graph.h_edge_dst, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_edge_weight, output_graph.h_edge_weight, sizeof(EdgeWeightType) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_src_offsets, output_graph.h_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1), cudaMemcpyHostToDevice); + +/* + cudaMalloc(&output_graph.twc_small_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&output_graph.twc_mid_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&output_graph.twc_large_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&output_graph.twc_bin_sizes, 3 * sizeof(int32_t)); + + cudaMalloc(&output_graph.strict_sum, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&output_graph.strict_cta_sum, NUM_CTA * 2 * sizeof(int32_t)); + cudaMalloc(&output_graph.strict_grid_sum, sizeof(int32_t)); +*/ + output_graph.twc_small_bin = graph.twc_small_bin; + output_graph.twc_mid_bin = graph.twc_mid_bin; + output_graph.twc_large_bin = graph.twc_large_bin; + output_graph.strict_sum = graph.strict_sum; + output_graph.strict_cta_sum = graph.strict_cta_sum; + output_graph.strict_grid_sum = output_graph.strict_grid_sum; + + output_graph.transposed_graph = &graph; + graph.transposed_graph = new GraphT(output_graph); + + + return output_graph; +} + +template +static void load_graph(GraphT &graph, std::string filename, bool to_sort = false) { + int flen = strlen(filename.c_str()); + const char* bin_extension = to_sort?".graphit_sbin":".graphit_bin"; + char bin_filename[1024]; + strcpy(bin_filename, filename.c_str()); + + if (string_ends_with(filename.c_str(), bin_extension) == false) { + strcat(bin_filename, "."); + strcat(bin_filename, typeid(EdgeWeightType).name()); + strcat(bin_filename, bin_extension); + } + + FILE *bin_file = fopen(bin_filename, "rb"); + if (!bin_file && string_ends_with(filename.c_str(), bin_extension)) { + std::cout << "Binary file not found" << std::endl; + exit(-1); + } + if (bin_file) { + CONSUME(fread(&graph.num_vertices, sizeof(int32_t), 1, bin_file)); + CONSUME(fread(&graph.num_edges, sizeof(int32_t), 1, bin_file)); + + graph.h_edge_src = new int32_t[graph.num_edges]; + graph.h_edge_dst = new int32_t[graph.num_edges]; + graph.h_edge_weight = new EdgeWeightType[graph.num_edges]; + + graph.h_src_offsets = new int32_t[graph.num_vertices + 1]; + + CONSUME(fread(graph.h_edge_src, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fread(graph.h_edge_dst, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fread(graph.h_edge_weight, sizeof(EdgeWeightType), graph.num_edges, bin_file)); + + CONSUME(fread(graph.h_src_offsets, sizeof(int32_t), graph.num_vertices + 1, bin_file)); + fclose(bin_file); + } else { + CLBase cli (filename); + WeightedBuilder builder (cli); + WGraph g = builder.MakeGraph(); + 
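// The binary cache consumed above and written at the end of this branch has the
// layout
//   int32_t num_vertices; int32_t num_edges;
//   int32_t edge_src[num_edges]; int32_t edge_dst[num_edges];
//   EdgeWeightType edge_weight[num_edges];
//   int32_t src_offsets[num_vertices + 1];
// and is named after the input file plus the weight type's typeid name and the
// ".graphit_bin" (or ".graphit_sbin" when sorted) suffix, so the slower GAPBS
// builder in this branch only runs the first time a graph/weight-type pair is
// used.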
graph.num_vertices = g.num_nodes(); + graph.num_edges = g.num_edges(); + + graph.h_edge_src = new int32_t[graph.num_edges]; + graph.h_edge_dst = new int32_t[graph.num_edges]; + graph.h_edge_weight = new EdgeWeightType[graph.num_edges]; + + graph.h_src_offsets = new int32_t[graph.num_vertices + 1]; + + int32_t tmp = 0; + graph.h_src_offsets[0] = tmp; + for (int32_t i = 0; i < g.num_nodes(); i++) { + for (auto j: g.out_neigh(i)) { + graph.h_edge_src[tmp] = i; + graph.h_edge_dst[tmp] = j.v; + graph.h_edge_weight[tmp] = j.w; + tmp++; + } + graph.h_src_offsets[i+1] = tmp; + } + if (to_sort) + sort_with_degree(graph); + FILE *bin_file = fopen(bin_filename, "wb"); + CONSUME(fwrite(&graph.num_vertices, sizeof(int32_t), 1, bin_file)); + CONSUME(fwrite(&graph.num_edges, sizeof(int32_t), 1, bin_file)); + CONSUME(fwrite(graph.h_edge_src, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fwrite(graph.h_edge_dst, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fwrite(graph.h_edge_weight, sizeof(EdgeWeightType), graph.num_edges, bin_file)); + CONSUME(fwrite(graph.h_src_offsets, sizeof(int32_t), graph.num_vertices + 1, bin_file)); + fclose(bin_file); + } + + cudaMalloc(&graph.d_edge_src, sizeof(int32_t) * graph.num_edges); + cudaMalloc(&graph.d_edge_dst, sizeof(int32_t) * graph.num_edges); + cudaMalloc(&graph.d_edge_weight, sizeof(EdgeWeightType) * graph.num_edges); + cudaMalloc(&graph.d_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1)); + + + cudaMemcpy(graph.d_edge_src, graph.h_edge_src, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(graph.d_edge_dst, graph.h_edge_dst, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(graph.d_edge_weight, graph.h_edge_weight, sizeof(EdgeWeightType) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(graph.d_src_offsets, graph.h_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1), cudaMemcpyHostToDevice); + //std::cout << filename << " (" << graph.num_vertices << ", " << graph.num_edges << ")" << std::endl; + + cudaMalloc(&graph.twc_small_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&graph.twc_mid_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&graph.twc_large_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&graph.twc_bin_sizes, 3 * sizeof(int32_t)); + + cudaMalloc(&graph.strict_sum, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&graph.strict_cta_sum, NUM_CTA * 2 * sizeof(int32_t)); + cudaMalloc(&graph.strict_grid_sum, sizeof(int32_t)); + + graph.transposed_graph = nullptr; + +} + +template +static int32_t builtin_getVertices(GraphT &graph) { + return graph.num_vertices; +} + +template +static int32_t __device__ device_builtin_getVertices(GraphT &graph) { + return graph.num_vertices; +} + +template +void __global__ init_degrees_kernel(int32_t *degrees, GraphT graph) { + for (int32_t vid = threadIdx.x + blockIdx.x * blockDim.x; vid < graph.num_vertices; vid += gridDim.x * blockDim.x) + degrees[vid] = graph.d_get_degree(vid); +} + +template +static int32_t* builtin_getOutDegrees(GraphT &graph) { + int32_t *degrees = nullptr; + cudaMalloc(°rees, sizeof(int32_t) * graph.num_vertices); + init_degrees_kernel<<>>(degrees, graph); + return degrees; +} + +} +#endif diff --git a/src/runtime_lib/infra_gpu/list.h b/src/runtime_lib/infra_gpu/list.h new file mode 100644 index 00000000..43e76441 --- /dev/null +++ b/src/runtime_lib/infra_gpu/list.h @@ -0,0 +1,108 @@ +#ifndef 
GRAPHIT_GPU_LIST_H +#define GRAPHIT_GPU_LIST_H + +#include +#include +using namespace cooperative_groups; + +namespace gpu_runtime { +/* +template +static void builtin_append(std::vector &vec, T elem) { + vec.push_back(elem); +} + +template +static T builtin_pop(std::vector &vec) { + T ret = vec.back(); + vec.pop_back(); + return ret; +} +*/ + +class VertexFrontierList { +public: + int32_t max_num_elems; + int32_t current_levels; + + int32_t * d_level_indices; + int32_t * d_vertices; +}; + +VertexFrontierList create_new_vertex_frontier_list(int32_t max_elems) { + VertexFrontierList vl; + vl.max_num_elems = max_elems; + vl.current_levels = 0; + + cudaMalloc(&(vl.d_level_indices), sizeof(int32_t) * (max_elems + 1)); + //vl.h_level_indices = new int32_t [max_elems + 1]; + //vl.h_level_indices[0] = 0; + cudaMemset(vl.d_level_indices, 0, sizeof(int32_t)); + cudaMalloc(&(vl.d_vertices), sizeof(int32_t) * max_elems); + return vl; +} + + +void builtin_insert(VertexFrontierList &vl, VertexFrontier &frontier) { + int32_t array[2]; + + cudaMemcpy(array, vl.d_level_indices + vl.current_levels, sizeof(int32_t), cudaMemcpyDeviceToHost); + vertex_set_prepare_sparse(frontier); + frontier.format_ready = VertexFrontier::SPARSE; + //int32_t at = vl.h_level_indices[vl.current_levels]; + int32_t at = array[0]; + int32_t num_elems = builtin_getVertexSetSize(frontier); + cudaMemcpy(vl.d_vertices + at, frontier.d_sparse_queue_input, num_elems * sizeof(int32_t), cudaMemcpyDeviceToDevice); + //vl.h_level_indices[vl.current_levels + 1] = at + num_elems; + array[1] = at + num_elems; + + cudaMemcpy(vl.d_level_indices + vl.current_levels + 1, array + 1, sizeof(int32_t), cudaMemcpyHostToDevice); + vl.current_levels++; +} + +void __device__ device_builtin_insert(VertexFrontierList &vl, VertexFrontier &frontier) { + vertex_set_prepare_sparse_device(frontier); + frontier.format_ready = VertexFrontier::SPARSE; + + int32_t at = vl.d_level_indices[vl.current_levels]; + int32_t num_elems = device_builtin_getVertexSetSize(frontier); + parallel_memcpy((unsigned char*)(vl.d_vertices + at), (unsigned char*)(frontier.d_sparse_queue_input), num_elems * sizeof(int32_t)); + if (threadIdx.x == 0 && blockIdx.x == 0) + vl.d_level_indices[vl.current_levels + 1] = at + num_elems; + vl.current_levels++; + this_grid().sync(); +} + + +void builtin_retrieve(VertexFrontierList &vl, VertexFrontier &frontier) { + if (vl.current_levels == 0) { + assert(false && "Too deep into vertex frontier list"); + } + int32_t array[2]; + + cudaMemcpy(array, vl.d_level_indices + vl.current_levels - 1, sizeof(int32_t)*2, cudaMemcpyDeviceToHost); + //int32_t at = vl.h_level_indices[vl.current_levels - 1]; + //int32_t num_elems = vl.h_level_indices[vl.current_levels] - at; + int32_t at = array[0]; + int32_t num_elems = array[1] - at; + cudaMemcpy(frontier.d_sparse_queue_input, vl.d_vertices + at, num_elems * sizeof(int32_t), cudaMemcpyDeviceToDevice); + cudaMemcpy(frontier.d_num_elems_input, &num_elems, sizeof(int32_t), cudaMemcpyHostToDevice); + frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; + vl.current_levels--; +} +void __device__ device_builtin_retrieve(VertexFrontierList &vl, VertexFrontier &frontier) { + if (vl.current_levels == 0) + assert(false && "Too deep into vertex frontier list"); + int32_t at = vl.d_level_indices[vl.current_levels -1]; + int32_t num_elems = vl.d_level_indices[vl.current_levels] - at; + parallel_memcpy((unsigned char*)frontier.d_sparse_queue_input, (unsigned char*) (vl.d_vertices + at), num_elems * sizeof(int32_t)); + 
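// A single thread publishes the restored element count; the grid-wide sync at the end
// of this function makes both the copied vertices and the count visible to every CTA
// before the frontier is used again.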
if (threadIdx.x == 0 && blockIdx.x == 0) + frontier.d_num_elems_input[0] = num_elems; + frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; + vl.current_levels--; + this_grid().sync(); +} +} + + +#endif diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h new file mode 100644 index 00000000..91d3220d --- /dev/null +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -0,0 +1,1044 @@ +#ifndef GRAPHIT_GPU_LOAD_BALANCE_H +#define GRAPHIT_GPU_LOAD_BALANCE_H + +#include "infra_gpu/graph.h" +#include "infra_gpu/vertex_frontier.h" +#include "infra_gpu/gpu_priority_queue.h" +#include +using namespace cooperative_groups; + +namespace gpu_runtime { + +template +using load_balance_payload_type = void (GraphT, int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier); + + +// VERTEX SET APPLY FUNCTIONS +template +static void __device__ vertex_set_apply(VertexFrontier &frontier) { + int32_t total_vertices = AccessorType::getSize(frontier); + for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < total_vertices; vidx += blockDim.x * gridDim.x) { + int32_t vid = AccessorType::getElement(frontier, vidx); + body(vid); + } +} +template +static void __global__ vertex_set_apply_kernel(VertexFrontier frontier) { + vertex_set_apply(frontier); +} + +// VERTEX BASED LOAD BALANCE FUNCTIONS +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ vertex_based_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + int32_t vid = threadIdx.x + blockDim.x * cta_id; + if (vid >= AccessorType::getSize(input_frontier)) + return; + int32_t src = AccessorType::getElement(input_frontier, vid); + for (int32_t eid = graph.d_src_offsets[src]; eid < graph.d_src_offsets[src+1]; eid++) { + if (src_filter(src) == false) + break; + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + + } + +/* + int32_t total_vertices = AccessorType::getSize(input_frontier); + for (int32_t vidx = threadIdx.x + blockDim.x * cta_id; vidx < total_vertices; vidx += num_cta * blockDim.x) { + int32_t src = AccessorType::getElement(input_frontier, vidx); + for (int32_t eid = graph.d_src_offsets[src]; eid < graph.d_src_offsets[src+1]; eid++) { + if (src_filter(src) == false) + break; + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } + } +*/ +} +template +void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + + int32_t num_threads = AccessorType::getSizeHost(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + //num_cta = NUM_CTA; + cta_size = CTA_SIZE; +} +template +void __device__ vertex_based_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSize(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ vertex_based_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + vertex_based_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ vertex_based_load_balance_host(GraphT &graph, VertexFrontier 
&input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + vertex_based_load_balance_info(input_frontier, num_cta, cta_size); + vertex_based_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ vertex_based_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + vertex_based_load_balance_info_device(input_frontier, num_cta, cta_size); + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + vertex_based_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); +} + +// EDGE_ONLY LOAD BALANCE FUNCTIONS + +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> +static void __device__ edge_only_load_balance(GraphT &graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta) { + int32_t thread_id = blockDim.x * cta_id + threadIdx.x; + int32_t total_threads = blockDim.x * total_cta; + for (int32_t eid = thread_id; eid < graph.num_edges; eid += total_threads) { + int32_t src = graph.d_edge_src[eid]; + if (src_filter(src) == true) { + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } + } +} + +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> +static void __device__ edge_only_load_balance_blocked(GraphT &graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta, int32_t index) { + int32_t thread_id = blockDim.x * cta_id + threadIdx.x; + int32_t total_threads = blockDim.x * total_cta; + int32_t starting_edge = index == 0?0:graph.d_bucket_sizes[index-1]; + int32_t ending_edge = graph.d_bucket_sizes[index]; + for (int32_t eid = thread_id + starting_edge; eid < ending_edge; eid += total_threads) { + int32_t src = graph.d_edge_src[eid]; + if (src_filter(src) == true) { + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } + } +} +template +void __host__ edge_only_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + num_cta = NUM_CTA; + cta_size = CTA_SIZE; +} +template +void __device__ edge_only_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + num_cta = NUM_CTA; + cta_size = CTA_SIZE; +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ edge_only_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + edge_only_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ edge_only_load_balance_blocked_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + for (int32_t index = 0; index < graph.num_buckets; index++) { + edge_only_load_balance_blocked(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x, index); + __syncthreads(); + } +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ edge_only_load_balance_host(GraphT &graph, VertexFrontier 
&input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + edge_only_load_balance_info(input_frontier, num_cta, cta_size); + edge_only_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ edge_only_load_balance_blocked_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta = NUM_CTA; + int32_t cta_size = CTA_SIZE; + edge_only_load_balance_blocked_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ edge_only_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + vertex_based_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); + this_grid().sync(); +} + +// TWCE LOAD BALANCE FUNCTIONS +#define STAGE_1_SIZE (8) +#define WARP_SIZE (32) +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> + +static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta) { + int32_t thread_id = blockDim.x * cta_id + threadIdx.x; + + int32_t lane_id = thread_id % 32; + + __shared__ int32_t stage2_queue[CTA_SIZE]; + __shared__ int32_t stage3_queue[CTA_SIZE]; + __shared__ int32_t stage_queue_sizes[3]; + + if (threadIdx.x < 3) { + stage_queue_sizes[threadIdx.x] = 0; + } + __syncthreads(); + __shared__ int32_t stage2_offset[CTA_SIZE]; + __shared__ int32_t stage3_offset[CTA_SIZE]; + __shared__ int32_t stage2_size[CTA_SIZE]; + __shared__ int32_t stage3_size[CTA_SIZE]; + + int32_t total_vertices = AccessorType::getSize(input_frontier); + int32_t local_vertex_idx = thread_id / (STAGE_1_SIZE); + int32_t degree; + int32_t s1_offset; + int32_t local_vertex; + int32_t src_offset; + if (local_vertex_idx < total_vertices) { + local_vertex = AccessorType::getElement(input_frontier, local_vertex_idx); + // Step 1 seggregate vertices into shared buffers + degree = graph.d_get_degree(local_vertex); + src_offset = graph.d_src_offsets[local_vertex]; + int32_t s3_size = degree/CTA_SIZE; + degree = degree - s3_size * CTA_SIZE; + if (s3_size > 0) { + if (threadIdx.x % (STAGE_1_SIZE) == 0) { + int32_t pos = atomicAggInc(&stage_queue_sizes[2]); + stage3_queue[pos] = local_vertex; + stage3_size[pos] = s3_size * CTA_SIZE; + stage3_offset[pos] = src_offset; + } + } + + int32_t s2_size = degree/WARP_SIZE; + degree = degree - WARP_SIZE * s2_size; + if (s2_size > 0) { + if (threadIdx.x % (STAGE_1_SIZE) == 0) { + int32_t pos = atomicAggInc(&stage_queue_sizes[1]); + stage2_queue[pos] = local_vertex; + stage2_offset[pos] = s3_size * CTA_SIZE + src_offset; + stage2_size[pos] = s2_size * WARP_SIZE; + } + } + s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + src_offset; + } else + local_vertex = -1; + __syncthreads(); + if (local_vertex_idx < total_vertices) { + // STAGE 1 + for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < degree + s1_offset; neigh_id += STAGE_1_SIZE) { + if (src_filter(local_vertex) == false) + break; + int32_t dst = graph.d_edge_dst[neigh_id]; + load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); + } + + } + __syncwarp(); + // STAGE 2 -- stage 2 is dynamically balanced + while(1) { + int32_t to_process; + if (lane_id == 0) { + to_process = 
atomicSub(&stage_queue_sizes[1], 1) - 1; + } + to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32); + if (to_process < 0) + break; + local_vertex = stage2_queue[to_process]; + degree = stage2_size[to_process]; + int32_t s2_offset = stage2_offset[to_process]; + for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < degree + s2_offset; neigh_id += WARP_SIZE) { + if (src_filter(local_vertex) == false) + break; + int32_t dst = graph.d_edge_dst[neigh_id]; + load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); + } + + } + // STAGE 3 -- all threads have to do all, no need for LB + for (int32_t wid = 0; wid < stage_queue_sizes[2]; wid++) { + local_vertex = stage3_queue[wid]; + degree = stage3_size[wid]; + int32_t s3_offset = stage3_offset[wid]; + for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < degree + s3_offset; neigh_id += CTA_SIZE) { + if (src_filter(local_vertex) == false) + break; + int32_t dst = graph.d_edge_dst[neigh_id]; + load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); + } + } +} +template +void __host__ TWCE_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSizeHost(frontier) * STAGE_1_SIZE; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} +template +void __device__ TWCE_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSize(frontier) * STAGE_1_SIZE; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ TWCE_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + TWCE_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ TWCE_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + TWCE_load_balance_info(input_frontier, num_cta, cta_size); + TWCE_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWCE_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + TWCE_load_balance_info_device(input_frontier, num_cta, cta_size); + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWCE_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); +} + +// CM load balance functions +int32_t __device__ binary_search_upperbound(int32_t *array, int32_t len, int32_t key){ + int32_t s = 0; + while(len>0){ + int32_t half = len>>1; + int32_t mid = s + half; + if(array[mid] > key){ + len = half; + }else{ + s = mid+1; + len = len-half-1; + } + } + return s; +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ CM_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + __shared__ int32_t sm_idx[CTA_SIZE], sm_deg[CTA_SIZE], sm_loc[CTA_SIZE]; + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = 
AccessorType::getSize(input_frontier); + + int32_t deg, index, src_idx; + if(thread_id < tot_size) { + index = AccessorType::getElement(input_frontier, thread_id); + deg = graph.d_get_degree(index); + + sm_idx[threadIdx.x] = index; + sm_deg[threadIdx.x] = deg; + sm_loc[threadIdx.x] = graph.d_src_offsets[index]; + } else { + deg = 0; + sm_deg[threadIdx.x] = deg; + } + + int32_t lane = (threadIdx.x & 31); + int32_t offset = 0; + + // prefix sum + int32_t cosize = blockDim.x; + int32_t tot_deg; + int32_t phase = threadIdx.x; + int32_t off=32; + + for(int32_t d=2; d<=32; d<<=1) { + int32_t temp = __shfl_up_sync((uint32_t)-1, deg, d/2); + if (lane % d == d - 1) deg += temp; + } + sm_deg[threadIdx.x] = deg; + + for(int32_t d=cosize>>(1+5); d>0; d>>=1){ + __syncthreads(); + if(phase>5); d<<=1){ + off >>=1; + __syncthreads(); + if(phase1; d>>=1) { + int32_t temp_big = __shfl_down_sync((uint32_t)-1, deg, d/2); + int32_t temp_small = __shfl_up_sync((uint32_t)-1, deg, d/2); + if (lane % d == d/2 - 1) deg = temp_big; + else if(lane % d == d - 1) deg += temp_small; + } + sm_deg[threadIdx.x] = deg; + __syncthreads(); + + // compute + int32_t width = thread_id - threadIdx.x + blockDim.x; + if(tot_size < width) width = tot_size; + width -= thread_id - threadIdx.x; + + for(int32_t i=threadIdx.x; i= width) continue; + src_idx = sm_idx[offset + id]; + if (src_filter(src_idx) == false) + continue; + int32_t ei = sm_loc[offset + id] + i - sm_deg[offset + id]; + int32_t dst_idx = graph.d_edge_dst[ei]; + load_balance_payload(graph, src_idx, dst_idx, ei, input_frontier, output_frontier); + } +} +template +void __host__ CM_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSizeHost(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} +template +void __device__ CM_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSize(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ CM_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + CM_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ CM_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + CM_load_balance_info(input_frontier, num_cta, cta_size); + CM_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ CM_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + CM_load_balance_info_device(input_frontier, num_cta, cta_size); + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + CM_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); +} + + +// WM load balance functions +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ WM_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + 
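// Warp-mapped (WM) strategy: each warp of 32 threads claims 32 consecutive frontier
// vertices, computes a warp-level prefix sum of their degrees with __shfl_up_sync, and
// then all 32 lanes walk the warp's combined edge range together, using
// binary_search_upperbound over the shared degree offsets to map each edge index back
// to its source vertex. This keeps lanes busy even when degrees inside a warp are skewed.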
__shared__ int32_t sm_idx[CTA_SIZE], sm_deg[CTA_SIZE], sm_loc[CTA_SIZE]; + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(input_frontier); + + int32_t deg, index, src_idx; + if(thread_id < tot_size) { + index = AccessorType::getElement(input_frontier, thread_id); + deg = graph.d_get_degree(index); + + sm_idx[threadIdx.x] = index; + sm_deg[threadIdx.x] = deg; + sm_loc[threadIdx.x] = graph.d_src_offsets[index]; + } else { + deg = 0; + sm_deg[threadIdx.x] = deg; + } + + // prefix sum + int32_t lane = (threadIdx.x&31); + int32_t offset = threadIdx.x - lane; + for(int32_t d=1; d<32; d<<=1) { + int32_t temp = __shfl_up_sync((uint32_t)-1, deg, d); + if (lane >= d) deg += temp; + } + int32_t tot_deg = __shfl_sync((uint32_t)-1, deg, 31); + if(lane == 31) deg = 0; + sm_deg[offset + ((lane+1)&31)] = deg; + __syncthreads(); + + // compute + int32_t width = thread_id - lane + 32; + if(tot_size < width) width = tot_size; + width -= thread_id - lane; + + for(int32_t i=lane; i +void __host__ WM_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSizeHost(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} +template +void __device__ WM_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSize(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ WM_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + WM_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ WM_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + WM_load_balance_info(input_frontier, num_cta, cta_size); + WM_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ WM_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + WM_load_balance_info_device(input_frontier, num_cta, cta_size); + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + WM_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); +} + +//TWCE load balance functions +#define MID_BIN (32) +#define LARGE_BIN (CTA_SIZE) + +template +void __device__ TWC_split_frontier (GraphT &graph, VertexFrontier &input_frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(input_frontier); + int32_t idx, deg; + if(thread_id < tot_size) { + idx = AccessorType::getElement(input_frontier, thread_id); + deg = graph.d_get_degree(idx); + if(deg < MID_BIN) { + int32_t k = atomicAggInc(&graph.twc_bin_sizes[0]); + graph.twc_small_bin[k] = idx; + } else if(deg < LARGE_BIN) { + int32_t k = atomicAggInc(&graph.twc_bin_sizes[1]); + graph.twc_mid_bin[k] = idx; + } else { + int32_t k = atomicAggInc(&graph.twc_bin_sizes[2]); + graph.twc_large_bin[k] = idx; + } + } +} +template +void __global__ TWC_split_frontier_kernel 
(GraphT graph, VertexFrontier input_frontier) { + TWC_split_frontier (graph, input_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWC_small_bin (GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + __shared__ int32_t sm_idx[CTA_SIZE], sm_deg[CTA_SIZE], sm_loc[CTA_SIZE]; + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = graph.twc_bin_sizes[0]; + + int32_t deg, index, src_idx; + if(thread_id < tot_size) { + index = graph.twc_small_bin[thread_id]; + deg = graph.d_get_degree(index); + + sm_idx[threadIdx.x] = index; + sm_deg[threadIdx.x] = deg; + sm_loc[threadIdx.x] = graph.d_src_offsets[index]; + } else { + deg = 0; + sm_deg[threadIdx.x] = deg; + } + + // prefix sum + int32_t lane = (threadIdx.x&31); + int32_t offset = threadIdx.x - lane; + for(int32_t d=1; d<32; d<<=1) { + int32_t temp = __shfl_up_sync((uint32_t)-1, deg, d); + if (lane >= d) deg += temp; + } + int32_t tot_deg = __shfl_sync((uint32_t)-1, deg, 31); + if(lane == 31) deg = 0; + sm_deg[offset + ((lane+1)&31)] = deg; + __syncthreads(); + + // compute + int32_t width = thread_id - lane + 32; + if(tot_size < width) width = tot_size; + width -= thread_id - lane; + + for(int32_t i=lane; i load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ TWC_small_bin_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + TWC_small_bin(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); + +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWC_mid_bin (GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t vid = (threadIdx.x + blockDim.x * cta_id)/MID_BIN; + int32_t tot_size = graph.twc_bin_sizes[1]; + + if (vid >= tot_size) + return; + + int32_t src = graph.twc_mid_bin[vid]; + for (int32_t eid = graph.d_src_offsets[src]+(threadIdx.x%MID_BIN); eid < graph.d_src_offsets[src+1]; eid+=MID_BIN) { + if (src_filter(src) == false) + break; + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ TWC_mid_bin_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + TWC_mid_bin(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); + +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWC_large_bin (GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t vid = (threadIdx.x + blockDim.x * cta_id)/LARGE_BIN; + int32_t tot_size = graph.twc_bin_sizes[2]; + if (vid >= tot_size) + return; + int32_t src = graph.twc_large_bin[vid]; + for (int32_t eid = graph.d_src_offsets[src]+(threadIdx.x%LARGE_BIN); eid < graph.d_src_offsets[src+1]; eid+=LARGE_BIN) { + if (src_filter(src) == false) + break; + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ TWC_large_bin_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + TWC_large_bin(graph, 
input_frontier, output_frontier, blockIdx.x, gridDim.x); + +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ TWC_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + cudaMemset(graph.twc_bin_sizes, 0, sizeof(int32_t) * 3); + int num_threads = AccessorType::getSizeHost(input_frontier); + int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + int cta_size = CTA_SIZE; + TWC_split_frontier_kernel<<>>(graph, input_frontier); + int32_t twc_bin_sizes[3]; + cudaMemcpy(twc_bin_sizes, graph.twc_bin_sizes, 3 * sizeof(int32_t), cudaMemcpyDeviceToHost); + num_threads = twc_bin_sizes[0]; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + if (num_cta) + TWC_small_bin_kernel<<>>(graph, input_frontier, output_frontier); + num_threads = twc_bin_sizes[1] * MID_BIN; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + if (num_cta) + TWC_mid_bin_kernel<<>>(graph, input_frontier, output_frontier); + num_threads = twc_bin_sizes[2] * LARGE_BIN; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + if (num_cta) + TWC_large_bin_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWC_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t thread_id = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_id < 3) { + graph.twc_bin_sizes[thread_id] = 0; + } + this_grid().sync(); + + int num_threads = AccessorType::getSize(input_frontier); + int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWC_split_frontier(graph, input_frontier, cta_id, num_cta); + __syncthreads(); + } + + this_grid().sync(); + + num_threads = graph.twc_bin_sizes[0]; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWC_small_bin(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + + num_threads = graph.twc_bin_sizes[1] * MID_BIN; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWC_mid_bin(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + + num_threads = graph.twc_bin_sizes[2] * LARGE_BIN; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWC_large_bin(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + + this_grid().sync(); +} + +// STRICT LOAD BALANCE FUNCTIONS + +#define NNZ_PER_BLOCK (CTA_SIZE) +#define STRICT_SM_SIZE (CTA_SIZE) +#define PREFIX_BLK (CTA_SIZE) + +template +void __device__ strict_gather(GraphT &graph, VertexFrontier &frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(frontier); + int32_t idx; + if(thread_id < tot_size) { + idx = AccessorType::getElement(frontier, thread_id); + graph.strict_sum[thread_id] = graph.d_get_degree(idx); + } +} + +template +void __global__ strict_gather_kernel(GraphT graph, VertexFrontier frontier) { + strict_gather(graph, frontier, blockIdx.x, gridDim.x); +} +void __device__ strict_get_partial_sum(int32_t *elt, int32_t *buf, int32_t f_size, int32_t nnz_per_blk, unsigned int cta_id, unsigned int num_cta) +{ + int32_t idx = cta_id*nnz_per_blk + 
threadIdx.x; + int32_t upper_idx = (cta_id+1)*nnz_per_blk; + if(upper_idx > f_size) upper_idx = f_size; + int32_t accum=0; + + __shared__ int32_t sm_accum[32]; + for(int32_t i=idx; i 0) base_offset = buf[cta_id]; + + int32_t idx = cta_id*nnz_per_blk + threadIdx.x; + int32_t upper_idx = (cta_id+1)*nnz_per_blk; + if(upper_idx > f_size) upper_idx = f_size; + + for(int32_t i=idx; i<(cta_id+1)*nnz_per_blk; i += blockDim.x) { + int32_t deg = 0; + if(i < upper_idx) deg = elt[i]; + + for(int32_t d=2; d<=32; d<<=1) { + int32_t temp = __shfl_up_sync((uint32_t)-1, deg, d/2); + if (lane % d == d - 1) deg += temp; + } + sm_deg[threadIdx.x] = deg; + + for(int32_t d=cosize>>(1+5); d>0; d>>=1){ + __syncthreads(); + if(phase>5); d<<=1){ + off >>=1; + __syncthreads(); + if(phase1; d>>=1) { + int32_t temp_big = __shfl_down_sync((uint32_t)-1, deg, d/2); + int32_t temp_small = __shfl_up_sync((uint32_t)-1, deg, d/2); + if (lane % d == d/2 - 1) deg = temp_big; + else if(lane % d == d - 1) deg += temp_small; + } + //sm_deg[threadIdx.x] = deg; + if(i < upper_idx) { + elt[i] = base_offset + deg; + } + __syncthreads(); + base_offset += tot_deg; + + } + __syncthreads(); + if (prefix_mode == 1 && threadIdx.x == 0) { + glt[0] = base_offset; + } +} +void __global__ strict_local_prefix_sum_kernel(int32_t *elt, int32_t *buf, int32_t *glt, int32_t prefix_mode, int32_t f_size, int32_t nnz_per_blk) { + strict_local_prefix_sum(elt, buf, glt, prefix_mode, f_size, nnz_per_blk, blockIdx.x, gridDim.x); +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ strict_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + __shared__ int32_t sm_idx[STRICT_SM_SIZE], sm_deg[STRICT_SM_SIZE], sm_loc[STRICT_SM_SIZE]; + //int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(input_frontier); + + int32_t index, src_idx; + //int32_t deg; + + // if(cta_id == num_cta - 1) return; + // can be fused + //bool last_tb = (cta_id == (graph.strict_grid_sum[0] + NNZ_PER_BLOCK-1)/NNZ_PER_BLOCK-1); + int32_t start_row = binary_search_upperbound(&graph.strict_sum[0], tot_size, NNZ_PER_BLOCK*cta_id)-1; + int32_t end_row = binary_search_upperbound(&graph.strict_sum[0], tot_size, NNZ_PER_BLOCK*(cta_id+1))-1; + + int32_t row_size = end_row - start_row + 1; + //int32_t start_idx; + + //if(row_size <= STRICT_SM_SIZE) { + if(row_size <= -1 ) { + if(threadIdx.x < row_size) { + index = AccessorType::getElement(input_frontier, start_row+threadIdx.x); + //deg = graph.d_get_degree(index); + + sm_idx[threadIdx.x] = index; + int32_t tmp_deg = graph.strict_sum[start_row + threadIdx.x] - cta_id * NNZ_PER_BLOCK; + if(tmp_deg >= 0) { + sm_deg[threadIdx.x] = tmp_deg; + sm_loc[threadIdx.x] = graph.d_src_offsets[index]; + } else { + sm_deg[threadIdx.x] = 0; + sm_loc[threadIdx.x] = graph.d_src_offsets[index] - tmp_deg; + } + } else { + //deg = 0; + sm_deg[threadIdx.x] = INT_MAX; + } + __syncthreads(); + + //int32_t lane = (threadIdx.x&31); + int32_t offset = 0; + + + int32_t tot_deg = graph.strict_grid_sum[0] - cta_id * NNZ_PER_BLOCK; + if(tot_deg > NNZ_PER_BLOCK) tot_deg = NNZ_PER_BLOCK; + //int32_t tot_deg; + //if(!last_tb) tot_deg = NNZ_PER_BLOCK; + //else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; + + //int32_t phase = threadIdx.x; + //int32_t off=32; + + int32_t width = row_size; + for(int32_t i=threadIdx.x; i= width) continue; + src_idx = sm_idx[offset + id]; + if 
(src_filter(src_idx) == false) + continue; + int32_t ei = sm_loc[offset + id] + i - sm_deg[offset + id]; + if(ei >= graph.num_edges) break; + int32_t dst_idx = graph.d_edge_dst[ei]; + load_balance_payload(graph, src_idx, dst_idx, ei, input_frontier, output_frontier); + } + } else { + int32_t tot_deg = graph.strict_grid_sum[0] - cta_id * NNZ_PER_BLOCK; + if(tot_deg > NNZ_PER_BLOCK) tot_deg = NNZ_PER_BLOCK; + //if(!last_tb) tot_deg = NNZ_PER_BLOCK; + //else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; + + int32_t width = row_size; + //int32_t offset = 0; + + for(int32_t i=cta_id*NNZ_PER_BLOCK+threadIdx.x; i= width) continue; + src_idx = AccessorType::getElement(input_frontier, start_row+id); + if (src_filter(src_idx) == false) + continue; + int32_t ei = graph.d_src_offsets[src_idx] + i - graph.strict_sum[start_row + id]; + if(ei >= graph.num_edges) break; + int32_t dst_idx = graph.d_edge_dst[ei]; + load_balance_payload(graph, src_idx, dst_idx, ei, input_frontier, output_frontier); + } + + + } +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ strict_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + strict_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ strict_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int num_threads = AccessorType::getSizeHost(input_frontier); + int num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; + int cta_size = CTA_SIZE; + strict_gather_kernel<<>>(graph, input_frontier); + + int32_t tot_blk = NUM_CTA; + int32_t low_blk = (num_threads + PREFIX_BLK - 1)/PREFIX_BLK; + if (low_blk < tot_blk) + tot_blk = low_blk; + + int32_t gran = PREFIX_BLK * tot_blk; + int32_t nnz_per_thread = (num_threads + gran - 1)/gran; + int32_t nnz_per_blk = (nnz_per_thread * PREFIX_BLK); + + + strict_get_partial_sum_kernel<<>>(graph.strict_sum, graph.strict_cta_sum, num_threads, nnz_per_blk); + + strict_local_prefix_sum_kernel<<<1, PREFIX_BLK>>>(graph.strict_cta_sum, graph.strict_cta_sum, graph.strict_grid_sum, 1, tot_blk + 1, (tot_blk + PREFIX_BLK)/PREFIX_BLK * PREFIX_BLK); + strict_local_prefix_sum_kernel<<>>(graph.strict_sum, graph.strict_cta_sum, graph.strict_grid_sum, 0, num_threads, nnz_per_blk); + cudaMemcpy(&num_threads, graph.strict_grid_sum, sizeof(int32_t), cudaMemcpyDeviceToHost); + cudaCheckLastError(); + num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; + cta_size = CTA_SIZE; + + strict_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ strict_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int num_threads = AccessorType::getSize(input_frontier); + int num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + strict_gather(graph, input_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); + + int32_t tot_blk = NUM_CTA; + int32_t low_blk = (num_threads + PREFIX_BLK - 1)/PREFIX_BLK; + if (low_blk < tot_blk) + tot_blk = low_blk; + int32_t gran = PREFIX_BLK * tot_blk; + int32_t nnz_per_thread = (num_threads + gran - 1)/gran; + int32_t nnz_per_blk = (nnz_per_thread * PREFIX_BLK); + + for (int32_t cta_id = blockIdx.x; cta_id < 
tot_blk; cta_id += gridDim.x) { + strict_get_partial_sum(graph.strict_sum, graph.strict_cta_sum, num_threads, nnz_per_blk, cta_id, tot_blk); + __syncthreads(); + } + this_grid().sync(); + if (blockIdx.x == 0) { + strict_local_prefix_sum(graph.strict_cta_sum, graph.strict_cta_sum, graph.strict_grid_sum, 1, tot_blk + 1, (tot_blk + PREFIX_BLK)/PREFIX_BLK * PREFIX_BLK, blockIdx.x, 1); + } + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < tot_blk; cta_id += gridDim.x) { + strict_local_prefix_sum(graph.strict_sum, graph.strict_cta_sum, graph.strict_grid_sum, 0, num_threads, nnz_per_blk, cta_id, tot_blk); + __syncthreads(); + } + this_grid().sync(); + num_threads = graph.strict_grid_sum[0]; + num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + strict_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); + +} + +} +#endif diff --git a/src/runtime_lib/infra_gpu/printer.h b/src/runtime_lib/infra_gpu/printer.h new file mode 100755 index 00000000..5b9cda01 --- /dev/null +++ b/src/runtime_lib/infra_gpu/printer.h @@ -0,0 +1,17 @@ +#ifndef GRAPHIT_GPU_PRINTER +#define GRAPHIT_GPU_PRINTER +#include + +namespace gpu_runtime { +void __device__ print(int32_t val) { + printf("%d\n", val); +} +void __device__ print(float val) { + printf("%f\n", val); +} +void __device__ print(const char* val) { + printf("%s\n", val); +} +} + +#endif diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h new file mode 100755 index 00000000..bc6fe394 --- /dev/null +++ b/src/runtime_lib/infra_gpu/support.h @@ -0,0 +1,65 @@ +#ifndef GRAPHIT_GPU_SUPPORT_H +#define GRAPHIT_GPU_SUPPORT_H +namespace gpu_runtime { +void cudaCheckLastError(void) { + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("Error: %s\n", cudaGetErrorString(err)); + exit(-1); + } +} +__device__ inline int32_t warp_bcast(int32_t mask, int32_t v, int32_t leader) { + return __shfl_sync((uint32_t)mask, v, leader); +} +__device__ inline int32_t atomicAggInc(int32_t *ctr) { + int32_t lane_id = threadIdx.x % 32; + + int mask = __activemask(); + int leader = __ffs(mask) - 1; + int res; + if(lane_id == leader) + res = atomicAdd(ctr, __popc(mask)); + res = warp_bcast(mask, res, leader); + + return (res + __popc(mask & ((1 << lane_id) - 1))); +} +template +static bool __device__ writeMin(T *dst, T src) { + if (*dst <= src) + return false; + T old_value = atomicMin(dst, src); + bool ret = (old_value > src); + return ret; +} +template +static bool __device__ writeMax(T *dst, T src) { + if (*dst >= src) + return false; + T old_value = atomicMax(dst, src); + bool ret = (old_value < src); + return ret; +} + + +template +static bool __device__ writeAdd(T *dst, T src) { + atomicAdd(dst, src); + return true; +} +template +static bool __device__ CAS(T *dst, T old_val, const T &new_val) { + if (*dst != old_val) + return false; + return old_val == atomicCAS(dst, old_val, new_val); +} +static void __device__ parallel_memset(unsigned char* dst, unsigned char val, size_t total_bytes) { + for (size_t index = threadIdx.x + blockDim.x * blockIdx.x; index < total_bytes; index += blockDim.x * gridDim.x) + dst[index] = val; +} +static void __device__ parallel_memcpy(unsigned char* dst, unsigned char* src, size_t total_bytes) { + for (size_t index = threadIdx.x + blockDim.x * blockIdx.x; index < total_bytes; index += blockDim.x * gridDim.x) + dst[index] = src[index]; +} +} + +#endif 
diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h new file mode 100644 index 00000000..c5f2d53d --- /dev/null +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -0,0 +1,417 @@ +#ifndef GPU_VERTEX_FRONTIER_H +#define GPU_VERTEX_FRONTIER_H + +#include "infra_gpu/support.h" +#include +#ifndef FRONTIER_MULTIPLIER +#define FRONTIER_MULTIPLIER (6) +#endif +using namespace cooperative_groups; +namespace gpu_runtime { +class VertexFrontier { + + public: + + int32_t max_num_elems; + + int32_t *d_num_elems_input; + int32_t *d_num_elems_output; + + int32_t * d_sparse_queue_input; + int32_t * d_sparse_queue_output; + + unsigned char* d_byte_map_input; + unsigned char* d_byte_map_output; + + uint32_t* d_bit_map_input; + uint32_t* d_bit_map_output; + + int32_t *d_dedup_counters; + int32_t curr_dedup_counter; + + // Extend this to check the current representation + enum format_ready_type { + SPARSE, + BITMAP, + BYTEMAP + }; + + format_ready_type format_ready; + + // PriorityQueue related trackers + int32_t* d_priority_array; + int32_t priority_cutoff; +}; + + +static void cudaFreeSafe(void* ptr) { + cudaFree(ptr); +} +void delete_vertex_frontier(VertexFrontier &frontier) { + cudaFreeSafe(frontier.d_sparse_queue_input); + cudaFreeSafe(frontier.d_sparse_queue_output); + cudaFreeSafe(frontier.d_num_elems_input); + cudaFreeSafe(frontier.d_num_elems_output); + cudaFreeSafe(frontier.d_byte_map_input); + cudaFreeSafe(frontier.d_byte_map_output); + cudaFreeSafe(frontier.d_bit_map_input); + cudaFreeSafe(frontier.d_bit_map_output); + cudaFreeSafe(frontier.d_dedup_counters); + return; +} +static VertexFrontier sentinel_frontier; +static __device__ VertexFrontier device_sentinel_frontier; + +static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { + int32_t curr_size = 0; + cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); + return curr_size; +} +static int32_t __device__ device_builtin_getVertexSetSize(VertexFrontier &frontier) { + this_grid().sync(); + return frontier.d_num_elems_input[0]; +} +class AccessorSparse { +public: + static int32_t __device__ getSize(VertexFrontier &frontier) { + return frontier.d_num_elems_input[0]; + } + static int32_t __device__ getElement(VertexFrontier &frontier, int32_t index) { + return frontier.d_sparse_queue_input[index]; + } + static int32_t getSizeHost(VertexFrontier &frontier) { + return builtin_getVertexSetSize(frontier); + } +}; +class AccessorAll { +public: + static int32_t __device__ getSize(VertexFrontier &frontier) { + return frontier.max_num_elems; + } + static int32_t __device__ getElement(VertexFrontier &frontier, int32_t index) { + return index; + } + static int32_t getSizeHost(VertexFrontier &frontier) { + return frontier.max_num_elems; + } +}; + +void __global__ initialize_frontier_all(VertexFrontier frontier) { + for (int32_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < frontier.max_num_elems; idx += blockDim.x * gridDim.x) + frontier.d_sparse_queue_input[idx] = idx; + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + frontier.d_num_elems_input[0] = frontier.max_num_elems; + } +} +static VertexFrontier create_new_vertex_set(int32_t num_vertices, int32_t init_elems = 0) { + VertexFrontier frontier; + frontier.max_num_elems = num_vertices; + cudaMalloc(&frontier.d_num_elems_input, sizeof(int32_t)); + cudaMalloc(&frontier.d_num_elems_output, sizeof(int32_t)); + cudaMalloc(&frontier.d_sparse_queue_input, sizeof(int32_t) * num_vertices * 
FRONTIER_MULTIPLIER); + cudaMalloc(&frontier.d_sparse_queue_output, sizeof(int32_t) * num_vertices * FRONTIER_MULTIPLIER); + + if (num_vertices == init_elems) { + initialize_frontier_all<<>>(frontier); + } else { + cudaMemset(frontier.d_num_elems_input, 0, sizeof(int32_t)); + } + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); + + + cudaMalloc(&frontier.d_byte_map_input, sizeof(unsigned char) * num_vertices); + cudaMalloc(&frontier.d_byte_map_output, sizeof(unsigned char) * num_vertices); + + cudaMemset(frontier.d_byte_map_input, 0, sizeof(unsigned char) * num_vertices); + cudaMemset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * num_vertices); + + int32_t num_byte_for_bitmap = (num_vertices + sizeof(uint32_t) * 8 - 1)/(sizeof(uint32_t) * 8); + cudaMalloc(&frontier.d_bit_map_input, sizeof(uint32_t) * num_byte_for_bitmap); + cudaMalloc(&frontier.d_bit_map_output, sizeof(uint32_t) * num_byte_for_bitmap); + + cudaMemset(frontier.d_bit_map_input, 0, sizeof(uint32_t) * num_byte_for_bitmap); + cudaMemset(frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); + cudaCheckLastError(); + + + frontier.curr_dedup_counter = 0; + cudaMalloc(&frontier.d_dedup_counters, sizeof(int32_t) * num_vertices); + cudaMemset(frontier.d_dedup_counters, 0, sizeof(int32_t) * num_vertices); + + frontier.format_ready = VertexFrontier::SPARSE; + + cudaCheckLastError(); + + return frontier; +} + +static void builtin_addVertex(VertexFrontier &frontier, int32_t vid) { + int32_t curr_size; + cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(frontier.d_sparse_queue_input + curr_size, &vid, sizeof(int32_t), cudaMemcpyHostToDevice); + curr_size++; + + cudaMemcpy(frontier.d_num_elems_input, &curr_size, sizeof(int32_t), cudaMemcpyHostToDevice); +} +static void __device__ enqueueVertexSparseQueue(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id) { + int32_t pos = atomicAggInc(sparse_queue_size); + sparse_queue[pos] = vertex_id; +} +static void __device__ enqueueVertexSparseQueueDedup(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id, VertexFrontier &frontier) { + int32_t vid = vertex_id; + if (frontier.d_dedup_counters[vid] < frontier.curr_dedup_counter) { + int32_t pos = atomicAggInc(sparse_queue_size); + sparse_queue[pos] = vertex_id; + frontier.d_dedup_counters[vid] = frontier.curr_dedup_counter; + } +} +static void __device__ enqueueVertexSparseQueueDedupPerfect(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id, VertexFrontier &frontier) { + int32_t vid = vertex_id; + if (writeMax(&frontier.d_dedup_counters[vid], frontier.curr_dedup_counter)) { + int32_t pos = atomicAggInc(sparse_queue_size); + sparse_queue[pos] = vertex_id; + } +} +static void __device__ enqueueVertexBytemap(unsigned char* byte_map, int32_t *byte_map_size, int32_t vertex_id) { + // We are not using atomic operation here because races are benign here + if (byte_map[vertex_id] == 1) + return; + byte_map[vertex_id] = 1; + atomicAggInc(byte_map_size); +} +static bool __device__ checkBit(uint32_t* array, int32_t index) { + uint32_t * address = array + index / (8 * sizeof(uint32_t)); + return (*address & (1 << (index % (8 * sizeof(uint32_t))))); +} +static bool __device__ setBit(uint32_t* array, int32_t index) { + uint32_t * address = array + index / (8 * sizeof(uint32_t)); + return atomicOr(address, (1 << (index % (8 * sizeof(uint32_t))))) & (1 << (index % (8 * sizeof(uint32_t)))); +} +static void __device__ 
enqueueVertexBitmap(uint32_t* bit_map, int32_t * bit_map_size, int32_t vertex_id) { + // We need atomics here because of bit manipulations + if (checkBit(bit_map, vertex_id)) + return; + if (!setBit(bit_map, vertex_id)) + atomicAggInc(bit_map_size); +} +static void swap_queues(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + temp = frontier.d_sparse_queue_input; + frontier.d_sparse_queue_input = frontier.d_sparse_queue_output; + frontier.d_sparse_queue_output = temp; + + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); +} + +static void __device__ swap_queues_device(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + temp = frontier.d_sparse_queue_input; + frontier.d_sparse_queue_input = frontier.d_sparse_queue_output; + frontier.d_sparse_queue_output = temp; + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + this_grid().sync(); +} +static void __device__ swap_queues_device_global(VertexFrontier &frontier) { + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + temp = frontier.d_sparse_queue_input; + frontier.d_sparse_queue_input = frontier.d_sparse_queue_output; + frontier.d_sparse_queue_output = temp; + } + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + this_grid().sync(); +} + +static void swap_bytemaps(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + unsigned char* temp2; + temp2 = frontier.d_byte_map_input; + frontier.d_byte_map_input = frontier.d_byte_map_output; + frontier.d_byte_map_output = temp2; + + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); + cudaMemset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); +} + +static void __device__ swap_bytemaps_device(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + unsigned char* temp2; + temp2 = frontier.d_byte_map_input; + frontier.d_byte_map_input = frontier.d_byte_map_output; + frontier.d_byte_map_output = temp2; + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + parallel_memset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); + this_grid().sync(); +} +static void __device__ swap_bytemaps_device_global(VertexFrontier &frontier) { + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + unsigned char* temp2; + temp2 = frontier.d_byte_map_input; + frontier.d_byte_map_input = frontier.d_byte_map_output; + frontier.d_byte_map_output = temp2; + } + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + this_grid().sync(); + parallel_memset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); + this_grid().sync(); +} +static void swap_bitmaps(VertexFrontier &frontier) { + int32_t 
*temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + uint32_t* temp2; + temp2 = frontier.d_bit_map_input; + frontier.d_bit_map_input = frontier.d_bit_map_output; + frontier.d_bit_map_output = temp2; + + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); + int32_t num_byte_for_bitmap = (frontier.max_num_elems + 8 * sizeof(uint32_t) - 1)/(sizeof(uint32_t) * 8); + cudaMemset(frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); + cudaCheckLastError(); +} +static void __device__ swap_bitmaps_device(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + uint32_t* temp2; + temp2 = frontier.d_bit_map_input; + frontier.d_bit_map_input = frontier.d_bit_map_output; + frontier.d_bit_map_output = temp2; + + int32_t num_byte_for_bitmap = (frontier.max_num_elems + 8 * sizeof(uint32_t) - 1)/(sizeof(uint32_t) * 8); + + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + parallel_memset((unsigned char*)frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); + this_grid().sync(); +} +static void __device__ swap_bitmaps_device_global(VertexFrontier &frontier) { + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + uint32_t* temp2; + temp2 = frontier.d_bit_map_input; + frontier.d_bit_map_input = frontier.d_bit_map_output; + frontier.d_bit_map_output = temp2; + } + + int32_t num_byte_for_bitmap = (frontier.max_num_elems + 8 * sizeof(uint32_t) - 1)/(sizeof(uint32_t) * 8); + + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + parallel_memset((unsigned char*)frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); + this_grid().sync(); +} +static void __device__ dedup_frontier_device(VertexFrontier &frontier) { + for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < frontier.d_num_elems_input[0]; vidx += blockDim.x * gridDim.x) { + int32_t vid = frontier.d_sparse_queue_input[vidx]; + if (frontier.d_dedup_counters[vid] < frontier.curr_dedup_counter) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, vid); + frontier.d_dedup_counters[vid] = frontier.curr_dedup_counter; + } + } +} +static void __global__ dedup_frontier_kernel(VertexFrontier frontier) { + dedup_frontier_device(frontier); +} +static void dedup_frontier(VertexFrontier &frontier) { + frontier.curr_dedup_counter++; + dedup_frontier_kernel<<>>(frontier); + swap_queues(frontier); +} + +static void __device__ dedup_frontier_device_perfect(VertexFrontier &frontier) { + for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < frontier.d_num_elems_input[0]; vidx += blockDim.x * gridDim.x) { + int32_t vid = frontier.d_sparse_queue_input[vidx]; + if (writeMax(&frontier.d_dedup_counters[vid], frontier.curr_dedup_counter)) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, vid); + } + } +} +static void __global__ dedup_frontier_kernel_perfect(VertexFrontier frontier) { + dedup_frontier_device_perfect(frontier); +} +static void dedup_frontier_perfect(VertexFrontier &frontier) { + frontier.curr_dedup_counter++; + dedup_frontier_kernel_perfect<<>>(frontier); + swap_queues(frontier); +} +bool 
__device__ true_function(int32_t _) { + return true; +} +template +static void __device__ vertex_set_create_reverse_sparse_queue(VertexFrontier &frontier) { + for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { + if ((to_func(node_id))) + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); + } +} +template +static void __global__ vertex_set_create_reverse_sparse_queue_kernel(VertexFrontier frontier) { + vertex_set_create_reverse_sparse_queue(frontier); +} + +template +static void vertex_set_create_reverse_sparse_queue_host(VertexFrontier &frontier) { + vertex_set_create_reverse_sparse_queue_kernel<<>>(frontier); + swap_queues(frontier); +} + +template +static void __device__ vertex_set_create_reverse_sparse_queue_device(VertexFrontier &frontier) { + vertex_set_create_reverse_sparse_queue(frontier); + this_grid().sync(); + swap_queues_device(frontier); +} +static void foo_bar(void) { +} + +template +static void __global__ vertex_set_where_kernel(int32_t num_vertices, VertexFrontier frontier) { + + for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < num_vertices; node_id += blockDim.x * gridDim.x) { + if (where_func(node_id)) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); + } + } + +} + +template +static void __host__ vertex_set_where(int32_t num_vertices, VertexFrontier &frontier) { + vertex_set_where_kernel<<>>(num_vertices, frontier); + swap_queues(frontier); +} + +} + +#endif + diff --git a/src/runtime_lib/infra_gpu/vertex_representation.h b/src/runtime_lib/infra_gpu/vertex_representation.h new file mode 100755 index 00000000..8b4db24c --- /dev/null +++ b/src/runtime_lib/infra_gpu/vertex_representation.h @@ -0,0 +1,134 @@ +#ifndef VERTEX_REPRESENTATION_H +#define VERTEX_REPRESENTATION_H + +#include "infra_gpu/vertex_frontier.h" +#include +using namespace cooperative_groups; +namespace gpu_runtime { +template +static void __device__ generalized_prepare_from_to(VertexFrontier &frontier) { + int32_t total_work = AccessorType::getSize(frontier); + for (int32_t index = threadIdx.x + blockIdx.x * blockDim.x; index < total_work; index += gridDim.x * blockDim.x) { + int32_t node_id = AccessorType::getElement(frontier, index); + if (condition(frontier, node_id)) + update(frontier, node_id); + } +} + +template +static void __global__ generalized_prepare_from_to_kernel(VertexFrontier frontier) { + generalized_prepare_from_to(frontier); +} + +static bool __device__ condition_sparse(VertexFrontier &frontier, int32_t node_id) { + return true; +} +static bool __device__ condition_bytemap(VertexFrontier &frontier, int32_t node_id) { + return frontier.d_byte_map_input[node_id] == 1; +} +static bool __device__ condition_bitmap(VertexFrontier &frontier, int32_t node_id) { + return checkBit(frontier.d_bit_map_input, node_id); +} + + +static void __device__ update_sparse(VertexFrontier &frontier, int32_t node_id) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); +} + +static void __device__ update_bytemap(VertexFrontier &frontier, int32_t node_id) { + enqueueVertexBytemap(frontier.d_byte_map_output, frontier.d_num_elems_output, node_id); +} + +static void __device__ update_bitmap(VertexFrontier &frontier, int32_t node_id) { + enqueueVertexBitmap(frontier.d_bit_map_output, frontier.d_num_elems_output, node_id); +} + +static void 
vertex_set_prepare_sparse(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_queues(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_queues(frontier); + return; + } +} +static void __device__ vertex_set_prepare_sparse_device(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_queues_device(frontier); + this_grid().sync(); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_queues_device(frontier); + this_grid().sync(); + return; + } +} +static void vertex_set_prepare_boolmap(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_bytemaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_bytemaps(frontier); + return; + } +} +static void __device__ vertex_set_prepare_boolmap_device(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_bytemaps_device(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_bytemaps_device(frontier); + return; + } +} +static void vertex_set_prepare_bitmap(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_bitmaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_bitmaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + return; + } +} +static void __device__ vertex_set_prepare_bitmap_device(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_bitmaps_device(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_bitmaps_device(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + return; + } + +} +} +#endif diff --git a/src/runtime_lib/infra_ligra/ligra/byte-pd.h b/src/runtime_lib/infra_ligra/ligra/byte-pd.h index d4340878..7bd124b9 100644 --- a/src/runtime_lib/infra_ligra/ligra/byte-pd.h +++ b/src/runtime_lib/infra_ligra/ligra/byte-pd.h @@ -254,13 +254,8 @@ long compressEdge(uchar *start, long curOffset, uintE e) { Returns: The new offset into the edge array */ -<<<<<<< HEAD static long sequentialCompressEdgeSet(uchar *edgeArray, long currentOffset, uintT degree, uintE vertexNum, uintE *savedEdges) { -======= -long sequentialCompressEdgeSet(uchar *edgeArray, long currentOffset, uintT degree, - uintE vertexNum, uintE *savedEdges) { ->>>>>>> 
a903707a446090b4d992269fbbbd22c099f06b28 if (degree > 0) { long startOffset = currentOffset; long numChunks = 1+(degree-1)/PARALLEL_DEGREE; diff --git a/src/runtime_lib/intrinsics.h b/src/runtime_lib/intrinsics.h index c6841b93..09a7419e 100644 --- a/src/runtime_lib/intrinsics.h +++ b/src/runtime_lib/intrinsics.h @@ -493,21 +493,7 @@ template T static builtin_pop (std::vector* vec){ // return (float)(usec.time_since_epoch().count())/1000; //} -static struct timeval start_time_; -static struct timeval elapsed_time_; - -static void startTimer(){ - gettimeofday(&start_time_, NULL); -} - -static float stopTimer(){ - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; - -} - +#include "graphit_timer.h" static char* argv_safe(int index, char** argv, int argc ){ // if index is less than or equal to argc than return argv[index] @@ -678,4 +664,7 @@ void updateBucketWithGraphItVertexSubset(VertexSubset* vset, julienne::P } + + + #endif //GRAPHIT_INTRINSICS_H_H diff --git a/test/c++/high_level_schedule_test.cpp b/test/c++/high_level_schedule_test.cpp index 7862b23c..67966f2d 100644 --- a/test/c++/high_level_schedule_test.cpp +++ b/test/c++/high_level_schedule_test.cpp @@ -43,6 +43,29 @@ class HighLevelScheduleTest : public ::testing::Test { " print \"finished running BFS\"; \n" "end"); + const char* bfs_char_gpu = ("element Vertex end\n" + "element Edge end\n" + "const edges : edgeset{Edge}(Vertex,Vertex) = load (\"../../test/graphs/test.el\");\n" + "const vertices : vertexset{Vertex} = edges.getVertices();\n" + "const parent : vector{Vertex}(int) = -1;\n" + "func updateEdge(src : Vertex, dst : Vertex) " + " parent[dst] = src; " + "end\n" + "func toFilter(v : Vertex) -> output : bool " + " output = parent[v] == -1; " + "end\n" + "func main() " + " var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); " + " frontier.addVertex(1); " + " while (frontier.getVertexSetSize() != 0) " + " #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge, parent, true); \n" + " delete frontier;\n" + " frontier = output;\n" + " end\n" + " print \"finished running BFS\"; \n" + "end"); + + const char* pr_char = ("element Vertex end\n" "element Edge end\n" @@ -129,6 +152,32 @@ class HighLevelScheduleTest : public ::testing::Test { " end\n" "end"; + const char * sssp_char_gpu = "element Vertex end\n" + "element Edge end\n" + "const edges : edgeset{Edge}(Vertex,Vertex, int) = load (\"../test/graphs/test.wel\");\n" + "const vertices : vertexset{Vertex} = edges.getVertices();\n" + "const SP : vector{Vertex}(int) = 2147483647; %should be INT_MAX \n" + "func updateEdge(src : Vertex, dst : Vertex, weight : int) -> output : bool\n" + " SP[dst] min= (SP[src] + weight);\n" + "end\n" + "func main() \n" + " var n : int = edges.getVertices();\n" + " var frontier : vertexset{Vertex} = new vertexset{Vertex}(0);\n" + " frontier.addVertex(0); %add source vertex \n" + " SP[0] = 0;\n" + " var rounds : int = 0;\n" + " while (frontier.getVertexSetSize() != 0)\n" + " #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge, SP);\n" + " delete frontier;\n" + " frontier = output;\n" + " rounds = rounds + 1;\n" + " if rounds == n\n" + " print \"negative cycle\";\n" + " end\n" + " end\n" + "end"; + + const char * sssp_async_char = "element Vertex end\n" "element Edge end\n" "const edges : edgeset{Edge}(Vertex,Vertex, int) = load 
(\"../test/graphs/test.wel\");\n" @@ -373,13 +422,13 @@ class HighLevelScheduleTest : public ::testing::Test { " end\n" "\n" " end\n" - "end"); - + "end"); + const char* pr_cc_char = ("element Vertex end\n" "element Edge end\n" "const edges : edgeset{Edge}(Vertex,Vertex) = load (\"test.el\");\n" "const vertices : vertexset{Vertex} = edges.getVertices();\n" - "const IDs : vector{Vertex}(int) = 1;\n" + "const IDs : vector{Vertex}(int) = 1;\n" "const old_rank : vector{Vertex}(float) = 1.0;\n" "const new_rank : vector{Vertex}(float) = 0.0;\n" "const out_degrees : vector{Vertex}(int) = edges.getOutDegrees();\n" @@ -912,8 +961,10 @@ class HighLevelScheduleTest : public ::testing::Test { "end\n"); bfs_str_ = string (bfs_char); + bfs_str_gpu_ = string (bfs_char_gpu); pr_str_ = string(pr_char); sssp_str_ = string (sssp_char); + sssp_str_gpu_ = string (sssp_char_gpu); sssp_async_str_ = string (sssp_async_char); cf_str_ = string (cf_char); cc_str_ = string (cc_char); @@ -992,14 +1043,30 @@ class HighLevelScheduleTest : public ::testing::Test { return be->emitCPP(); } + + int basicTestWithGPUSchedule( + fir::high_level_schedule::ProgramScheduleNode::Ptr program) { + + graphit::Midend *me = new graphit::Midend(context_, program->getSchedule()); + std::cout << "fir: " << std::endl; + std::cout << *(context_->getProgram()); + std::cout << std::endl; + + me->emitMIR(mir_context_); + graphit::Backend *be = new graphit::Backend(mir_context_); + return be->emitGPU(); + } + std::vector *errors_; graphit::FIRContext *context_; Frontend *fe_; graphit::MIRContext *mir_context_; string bfs_str_; + string bfs_str_gpu_; string pr_str_; string sssp_str_; + string sssp_str_gpu_; string sssp_async_str_; string cf_str_; string cc_str_; @@ -2593,13 +2660,13 @@ TEST_F(HighLevelScheduleTest, UnorderedKCoreSparsePushParallel){ } TEST_F(HighLevelScheduleTest, UnorderedKCoreSparsePushDensePullParallel){ -istringstream is (unordered_kcore_str_); -fe_->parseStream(is, context_, errors_); -fir::high_level_schedule::ProgramScheduleNode::Ptr program + istringstream is (unordered_kcore_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); -program->configApplyDirection("s1", "SparsePush-DensePull"); -program->configApplyParallelization("s1", "dynamic-vertex-parallel"); -EXPECT_EQ (0, basicTestWithSchedule(program)); + program->configApplyDirection("s1", "SparsePush-DensePull"); + program->configApplyParallelization("s1", "dynamic-vertex-parallel"); + EXPECT_EQ (0, basicTestWithSchedule(program)); } TEST_F(HighLevelScheduleTest, KCoreSumReduceBeforeUpdate){ @@ -2659,23 +2726,23 @@ EXPECT_EQ (0, basicTestWithSchedule(program)); } TEST_F(HighLevelScheduleTest, KCoreDensePullParallel){ -istringstream is (kcore_str_); -fe_->parseStream(is, context_, errors_); -fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); -program->configApplyDirection("s1", "DensePull"); -program->configApplyParallelization("s1", "dynamic-vertex-parallel"); -EXPECT_EQ (0, basicTestWithSchedule(program)); + istringstream is (kcore_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + program->configApplyDirection("s1", "DensePull"); + program->configApplyParallelization("s1", "dynamic-vertex-parallel"); + EXPECT_EQ (0, basicTestWithSchedule(program)); } TEST_F(HighLevelScheduleTest, KCoreSparsePushDensePullParallel){ -istringstream is (kcore_str_); 
-fe_->parseStream(is, context_, errors_); -fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); -program->configApplyDirection("s1", "SparsePush-DensePull"); -program->configApplyParallelization("s1", "dynamic-vertex-parallel"); -EXPECT_EQ (0, basicTestWithSchedule(program)); + istringstream is (kcore_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + program->configApplyDirection("s1", "SparsePush-DensePull"); + program->configApplyParallelization("s1", "dynamic-vertex-parallel"); + EXPECT_EQ (0, basicTestWithSchedule(program)); } TEST_F(HighLevelScheduleTest, ParForSimpleSchedule){ @@ -2708,15 +2775,84 @@ TEST_F(HighLevelScheduleTest, ParForNestedSchedule){ } TEST_F(HighLevelScheduleTest, SetCoverUintDefaultSchedule){ -istringstream is (setcover_uint_str_); -fe_->parseStream(is, context_, errors_); -fir::high_level_schedule::ProgramScheduleNode::Ptr program + istringstream is (setcover_uint_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); -EXPECT_EQ (0, basicTestWithSchedule(program)); + EXPECT_EQ (0, basicTestWithSchedule(program)); } +TEST_F(HighLevelScheduleTest, BFSBasicSimpleGPUScheduleTest) { + istringstream is (bfs_str_gpu_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + // Now apply the GPU Schedule + fir::gpu_schedule::SimpleGPUSchedule s1; + s1.configDeduplication(fir::gpu_schedule::DISABLED); + s1.configDirection(fir::gpu_schedule::PUSH); + program->applyGPUSchedule("s1", s1); + EXPECT_EQ (0, basicTestWithGPUSchedule(program)); +} +TEST_F(HighLevelScheduleTest, BFSBasicHybridGPUScheduleTest) { + istringstream is (bfs_str_gpu_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + // Now apply the GPU Schedule + fir::gpu_schedule::SimpleGPUSchedule s1; + fir::gpu_schedule::SimpleGPUSchedule s2; + s1.configDeduplication(fir::gpu_schedule::DISABLED); + s1.configDirection(fir::gpu_schedule::PUSH); + s2 = s1; + s2.configDirection(fir::gpu_schedule::PULL); + fir::gpu_schedule::HybridGPUSchedule h1 (fir::gpu_schedule::INPUT_VERTEXSET_SIZE, 0.2, s1, s2); + program->applyGPUSchedule("s1", h1); + EXPECT_EQ (0, basicTestWithGPUSchedule(program)); +} +TEST_F(HighLevelScheduleTest, SSSP_LabelProp_GPUScheduleTest) { + istringstream is (sssp_str_gpu_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + fir::gpu_schedule::SimpleGPUSchedule s1; + s1.configDeduplication(fir::gpu_schedule::ENABLED); + s1.configDirection(fir::gpu_schedule::PUSH); + program->applyGPUSchedule("s1", s1); + EXPECT_EQ (0, basicTestWithGPUSchedule(program)); +} +TEST_F(HighLevelScheduleTest, DeltaSteppingWithDefaultGPUSchedule) { + istringstream is (delta_stepping_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + fir::gpu_schedule::SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); + EXPECT_EQ (0, basicTestWithGPUSchedule(program)); +} +TEST_F(HighLevelScheduleTest, BFSHybridPushPullScheduleTest) { + using namespace fir::gpu_schedule; + istringstream is (bfs_str_gpu_); + fe_->parseStream(is, context_, errors_); + 
fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + // Now apply the GPU Schedule + SimpleGPUSchedule s1; + s1.configDeduplication(ENABLED); + s1.configFrontierCreation(UNFUSED_BITMAP); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, 0.12, s1, s2); + program->applyGPUSchedule("s1", h1); + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); +} diff --git a/test/c++/midend_test.cpp b/test/c++/midend_test.cpp index 381b28b4..1b9974ba 100644 --- a/test/c++/midend_test.cpp +++ b/test/c++/midend_test.cpp @@ -121,6 +121,77 @@ TEST_F(MidendTest, SimpleVertexSetDeclAllocWithMain) { EXPECT_EQ (0, basicTest(is)); } +// Test cases for the MIRMetadata API +TEST_F(MidendTest, SimpleMetadataTest) { + istringstream is("func main() print 4; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + + main_func->setMetadata("basic_boolean_md", true); + main_func->setMetadata("basic_int_md", 42); + EXPECT_EQ(true, main_func->hasMetadata("basic_boolean_md")); + EXPECT_EQ(true, main_func->getMetadata("basic_boolean_md")); + + EXPECT_EQ(true, main_func->hasMetadata("basic_int_md")); + EXPECT_EQ(42, main_func->getMetadata("basic_int_md")); + +} +TEST_F(MidendTest, SimpleMetadataTestNoExist) { + istringstream is("func main() print 4; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + + main_func->setMetadata("basic_int_md", 42); + EXPECT_EQ(false, main_func->hasMetadata("other_int_md")); + EXPECT_EQ(false, main_func->hasMetadata("basic_int_md")); +} + +TEST_F(MidendTest, SimpleMetadataTestString) { + istringstream is("func main() print 4; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + + main_func->setMetadata("basic_str_md", "md value"); + EXPECT_EQ(true, main_func->hasMetadata("basic_str_md")); + EXPECT_EQ("md value", main_func->getMetadata("basic_str_md")); +} + +TEST_F(MidendTest, SimpleMetadataTestMIRNodeAsMD) { + istringstream is("const val:int = 42;\nfunc main() print val; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + EXPECT_EQ(1, mir_context_->getConstants().size()); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + mir::VarDecl::Ptr decl = mir_context_->getConstants()[0]; + + main_func->setMetadata("used_var_md", decl); + + EXPECT_EQ(true, main_func->hasMetadata("used_var_md")); + mir::MIRNode::Ptr mdnode = main_func->getMetadata("used_var_md"); + EXPECT_EQ(true, mir::isa(mdnode)); +} + +TEST_F(MidendTest, SimpleMetadataTestMIRNodeVectorAsMD) { + istringstream is("const val:int = 42;\nconst val2: int = 55;\nfunc main() print val + val2; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + EXPECT_EQ(2, mir_context_->getConstants().size()); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + std::vector decls = mir_context_->getConstants(); + + main_func->setMetadata>("used_vars_md", decls); + + EXPECT_EQ(true, main_func->hasMetadata>("used_vars_md")); + EXPECT_EQ(2, main_func->getMetadata>("used_vars_md").size()); +} TEST_F(MidendTest, 
SimpleIntersectionOperator) { istringstream is("element Vertex end\n" "element Edge end\n" diff --git a/test/c++/test.cpp b/test/c++/test.cpp index 52c74b56..a639ff80 100644 --- a/test/c++/test.cpp +++ b/test/c++/test.cpp @@ -56,17 +56,18 @@ int main(int argc, char **argv) { // // ::testing::GTEST_FLAG(filter) = "LowLevelScheduleTest.SimpleApplyFunctionFusion"; -// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.UnorderedKCoreSparsePushDensePullParallel"; +// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.DeltaSteppingWithDefaultGPUSchedule"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.UnorderedKCoreSparsePushParallel"; -// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreSparsePushParallel"; +// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.DeltaSteppingDensePullParallel"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreSparsePushSerial"; // // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreSumReduceBeforeUpdate"; +// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.SSSP_LabelProp_GPUScheduleTest"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.AStarDeltaSteppingWithEagerPriorityUpdateWithMergeArgv"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreSumReduceBeforeUpdate"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.DeltaSteppingWithDefaultSchedule"; @@ -93,6 +94,7 @@ int main(int argc, char **argv) { // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.PRCCPullParallelDifferentSegments"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.PRPullParallelNumaAware"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.PRPullParallelNumaAware"; +// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.BFSBasicSimpleGPUScheduleTest"; return RUN_ALL_TESTS(); diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py new file mode 100644 index 00000000..8266df53 --- /dev/null +++ b/test/gpu_tests/all_gpu_tests.py @@ -0,0 +1,200 @@ +import unittest +import subprocess +import os +import shutil +import sys + +GRAPHIT_BUILD_DIRECTORY="${GRAPHIT_BUILD_DIRECTORY}".strip().rstrip("/") +GRAPHIT_SOURCE_DIRECTORY="${GRAPHIT_SOURCE_DIRECTORY}".strip().rstrip("/") +CXX_COMPILER="${CXX_COMPILER}" + +NVCC_COMPILER="${NVCC_COMPILER}" + +class TestGPURuntimeLibrary(unittest.TestCase): + @classmethod + def get_command_output_class(self, command): + output = "" + if isinstance(command, list): + proc = subprocess.Popen(command, stdout=subprocess.PIPE) + else: + print(command) + proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + exitcode = proc.wait() + for line in proc.stdout.readlines(): + if isinstance(line, bytes): + line = line.decode() + output += line.rstrip() + "\n" + + proc.stdout.close() + return exitcode, output + + def get_command_output(self, command): + (exitcode, output) = self.get_command_output_class(command) + self.assertEqual(exitcode, 0) + return output + + def sssp_verified_test(self, input_file_name, use_delta=False): + self.cpp_compile_test(input_file_name, []) + if use_delta: + #start point 0, delta 10, verified + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 10 v > " + self.verifier_input) + else: + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 v > " + self.verifier_input) + output = self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t " + self.verifier_input + " -r 0") + test_flag 
= False + for line in output.rstrip().split("\n"): + if line.rstrip().find("SUCCESSFUL") != -1: + test_flag = True + break; + self.assertEqual(test_flag, True) + + @classmethod + def setUpClass(cls): + if NVCC_COMPILER == "CUDA_NVCC_EXECUTABLE-NOTFOUND": + print ("Cannot find CUDA compiler") + exit(-1) + + cls.build_directory = GRAPHIT_BUILD_DIRECTORY + cls.scratch_directory = GRAPHIT_BUILD_DIRECTORY + "/scratch" + cls.verifier_directory = cls.build_directory + "/bin" + if os.path.isdir(cls.scratch_directory): + shutil.rmtree(cls.scratch_directory) + os.mkdir(cls.scratch_directory) + + cls.nvcc_command = NVCC_COMPILER + " -ccbin " + CXX_COMPILER + " " + cls.test_input_directory = GRAPHIT_SOURCE_DIRECTORY + "/test/gpu_tests/test_input" + + cls.get_command_output_class(cls.nvcc_command + cls.test_input_directory + "/obtain_gpu_cc.cu -o " + cls.scratch_directory + "/obtain_gpu_cc") + output = cls.get_command_output_class(cls.scratch_directory + "/obtain_gpu_cc")[1].split() + + if len(output) != 2: + print ("Cannot obtain GPU information") + exit(-1) + compute_capability = output[0] + num_of_sm = output[1] + + cls.nvcc_command += " -rdc=true -DNUM_CTA=" + str(int(num_of_sm)*2) + " -DCTA_SIZE=512 -gencode arch=compute_" + compute_capability + ",code=sm_" + compute_capability + cls.nvcc_command += " -std=c++11 -O3 -I " + GRAPHIT_SOURCE_DIRECTORY + "/src/runtime_lib/ -Xcompiler \"-w\" -Wno-deprecated-gpu-targets --use_fast_math -Xptxas \" -dlcm=ca --maxrregcount=64\" " + + shutil.copytree(GRAPHIT_SOURCE_DIRECTORY + "/test/graphs", cls.scratch_directory + "/graphs") + cls.graph_directory = cls.scratch_directory + "/graphs" + cls.executable_name = cls.scratch_directory + "/test_executable" + cls.cuda_filename = cls.scratch_directory + "/test_cpp.cu" + + cls.graphitc_py = GRAPHIT_BUILD_DIRECTORY + "/bin/graphitc.py" + cls.verifier_input = cls.scratch_directory + "/verifier_input" + + def cpp_compile_test(self, input_file_name, extra_cpp_args=[]): + if input_file_name[0] == "/": + compile_command = self.nvcc_command + input_file_name + " -o " + self.executable_name + " " + " ".join(extra_cpp_args) + else: + compile_command = self.nvcc_command + self.test_input_directory + "/" + input_file_name + " -o " + self.executable_name + " " + " ".join(extra_cpp_args) + self.get_command_output(compile_command) + + def cpp_exec_test(self, input_file_name, extra_cpp_args=[], extra_exec_args=[]): + self.cpp_compile_test(input_file_name, extra_cpp_args) + return self.get_command_output(self.executable_name + " " + " ".join(extra_exec_args)) + + def graphit_generate_test(self, input_file_name, input_schedule_name=""): + if input_file_name[0] != "/": + input_file_name = self.test_input_directory + "/" + input_file_name + if input_schedule_name != "" and input_schedule_name[0] != "/": + input_schedule_name = self.test_input_directory + "/" + input_schedule_name + + if input_schedule_name != "": + self.get_command_output("python " + self.graphitc_py + " -a " + input_file_name + " -f " + input_schedule_name + " -o " + self.cuda_filename) + else: + self.get_command_output("python " + self.graphitc_py + " -f " + input_file_name + " -o " + self.cuda_filename) + + def graphit_compile_test(self, input_file_name, input_schedule_name="", extra_cpp_args=[]): + self.graphit_generate_test(input_file_name, input_schedule_name) + self.cpp_compile_test(self.cuda_filename, extra_cpp_args) + + def graphit_exec_test(self, input_file_name, input_schedule_name="", extra_cpp_args=[], extra_exec_args=[]): + 
self.graphit_generate_test(input_file_name, input_schedule_name) + return self.cpp_exec_test(self.cuda_filename, extra_cpp_args, extra_exec_args) + + def test_basic_compile(self): + self.cpp_compile_test("basic_compile.cu") + def test_basic_load_graph(self): + output = self.cpp_exec_test("basic_load_graph.cu", [], [self.graph_directory + "/simple_mtx.mtx"]) + output = output.split("\n") + self.assertEqual(len(output), 2) + self.assertEqual(output[0], "14, 106") + def test_runtime_library(self): + print (self.cpp_exec_test("runtime_lib_tests.cu", ["-I", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest/gtest-all.cc"], [self.graph_directory])) + + def test_sssp_lp_runtime_lib(self): + self.cpp_exec_test("sssp_lp.cu", [], [self.graph_directory + "/simple_mtx.mtx", "v"]) + + def test_sssp_lp_verified(self): + self.sssp_verified_test("sssp_lp.cu") + + def test_sssp_delta_stepping(self): + self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/simple_mtx.mtx", "0", "10", "v"]) + + def test_sssp_delta_stepping_verified(self): + self.sssp_verified_test("sssp_delta_stepping.cu", True) + + def test_sssp_delta_stepping_verified_frontier_byval(self): + self.sssp_verified_test("sssp_delta_stepping_frontier_byval.cu", True) + + def test_simple_graphit_exec(self): + output = self.graphit_exec_test("simple_graph_load.gt", "default_gpu_schedule.gt", [], [self.graph_directory + "/simple_mtx.mtx"]) + output = output.split("\n") + self.assertEqual(len(output), 2) + self.assertEqual(output[0], "14") + + def test_simple_graphit_sssp_basic_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_default_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_TWCE_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_TWCE_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_TWC_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_TWC_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_CM_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_CM_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_WM_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_WM_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_strict_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_strict_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_vertex_based_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_vertex_based_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_TWC_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_TWC_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_TWCE_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_TWCE_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_CM_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_CM_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, 
False) + + def test_simple_graphit_sssp_WM_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_WM_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_strict_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_strict_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + +if __name__ == '__main__': + unittest.main() + #suite = unittest.TestSuite() + #suite.addTest(TestGraphitCompiler('test_sssp_delta_stepping')) + #unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/test/gpu_tests/test_input/basic_compile.cu b/test/gpu_tests/test_input/basic_compile.cu new file mode 100644 index 00000000..f1b3dd0c --- /dev/null +++ b/test/gpu_tests/test_input/basic_compile.cu @@ -0,0 +1,5 @@ +#include "gpu_intrinsics.h" + +int __host__ main(int argc, char* argv[]) { + return 0; +} diff --git a/test/gpu_tests/test_input/basic_load_graph.cu b/test/gpu_tests/test_input/basic_load_graph.cu new file mode 100644 index 00000000..4cd0a330 --- /dev/null +++ b/test/gpu_tests/test_input/basic_load_graph.cu @@ -0,0 +1,9 @@ +#include "gpu_intrinsics.h" + +gpu_runtime::GraphT edges; + +int __host__ main(int argc, char* argv[]) { + gpu_runtime::load_graph(edges, argv[1], false); + std::cout << edges.num_vertices << ", " << edges.num_edges << std::endl; + return 0; +} diff --git a/test/gpu_tests/test_input/default_gpu_schedule.gt b/test/gpu_tests/test_input/default_gpu_schedule.gt new file mode 100644 index 00000000..ae99ff2b --- /dev/null +++ b/test/gpu_tests/test_input/default_gpu_schedule.gt @@ -0,0 +1,3 @@ +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); diff --git a/test/gpu_tests/test_input/inputs/sssp.gt b/test/gpu_tests/test_input/inputs/sssp.gt new file mode 100644 index 00000000..06cbe777 --- /dev/null +++ b/test/gpu_tests/test_input/inputs/sssp.gt @@ -0,0 +1,40 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const SP : vector{Vertex}(int) = 2147483647; %should be INT_MAX + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + SP[dst] min= (SP[src] + weight); +end + +func reset(v: Vertex) + SP[v] = 2147483647; +end + +func main() + vertices.apply(reset); + var n : int = edges.getVertices(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); %add source vertex + SP[start_vertex] = 0; + var rounds : int = 0; + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge, SP); + delete frontier; + frontier = output; + rounds = rounds + 1; + if rounds == n + break; + end + end + delete frontier; + for vid in 0:n + print SP[vid]; + end +end + + + diff --git a/test/gpu_tests/test_input/obtain_gpu_cc.cu b/test/gpu_tests/test_input/obtain_gpu_cc.cu new file mode 100644 index 00000000..bdec4266 --- /dev/null +++ b/test/gpu_tests/test_input/obtain_gpu_cc.cu @@ -0,0 +1,31 @@ +#include +#include +#include + +int main(int argc, char *argv[]) { + cudaDeviceProp prop; + cudaError_t status; + int device_count; + int device_index = 0; + if (argc > 1) { + device_index = atoi(argv[1]); + } + + status = cudaGetDeviceCount(&device_count); + if (status != cudaSuccess) { + fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", 
cudaGetErrorString(status)); + return -1; + } + if (device_index >= device_count) { + fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", device_index, device_count); + return -1; + } + status = cudaGetDeviceProperties(&prop, device_index); + if (status != cudaSuccess) { + fprintf(stderr,"cudaGetDeviceProperties() for device device_index failed: %s\n", cudaGetErrorString(status)); + return -1; + } + int v = prop.major * 10 + prop.minor; + printf("%d\n", v); + printf("%d\n", prop.multiProcessorCount); +} diff --git a/test/gpu_tests/test_input/runtime_lib_tests.cu b/test/gpu_tests/test_input/runtime_lib_tests.cu new file mode 100644 index 00000000..c6f0d893 --- /dev/null +++ b/test/gpu_tests/test_input/runtime_lib_tests.cu @@ -0,0 +1,98 @@ +#include +#define NUM_BLOCKS (80) +#include "gpu_intrinsics.h" + +std::string graph_directory; + +class GPURuntimeLibTest: public ::testing::Test { +protected: + virtual void SetUp() { + } + virtual void TearDown() { + } + +}; +TEST_F(GPURuntimeLibTest, SimpleLoadGraphFromFileTest) { + gpu_runtime::GraphT edges; + gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); + EXPECT_EQ (14, edges.num_vertices); +} + +TEST_F(GPURuntimeLibTest, SimplePriorityQueueTest){ + gpu_runtime::GraphT edges; + gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); + int num_vertices = gpu_runtime::builtin_getVertices(edges); + int* priorities = new int[num_vertices]; + gpu_runtime::GPUPriorityQueue pq; + EXPECT_EQ (14, num_vertices); +} + +__device__ int32_t* test_array_1; +void __device__ vertex_set_apply_all_test_function(int32_t vid) { + test_array_1[vid] += 1; +} + +TEST_F(GPURuntimeLibTest, VertexSetApplyAllTest) { + gpu_runtime::GraphT edges; + gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); + int num_vertices = gpu_runtime::builtin_getVertices(edges); + EXPECT_EQ (14, num_vertices); + + int32_t *test_array; + cudaMalloc(&test_array, num_vertices * sizeof(int32_t)); + cudaMemcpyToSymbol(test_array_1, &test_array, sizeof(int32_t*), 0); + + int32_t *test_array_host = new int32_t[num_vertices]; + cudaMemset(test_array, 0, sizeof(int32_t) * num_vertices); + + gpu_runtime::vertex_set_apply_kernel<<>>(edges.getFullFrontier()); + + cudaMemcpy(test_array_host, test_array, sizeof(int32_t) * num_vertices, cudaMemcpyDeviceToHost); + cudaFree(test_array); + for (int32_t index = 0; index < num_vertices; index++) { + EXPECT_EQ(1, test_array_host[index]); + } +} + + +TEST_F(GPURuntimeLibTest, VertexSetApplySparseTest) { + gpu_runtime::GraphT edges; + gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); + int num_vertices = gpu_runtime::builtin_getVertices(edges); + EXPECT_EQ (14, num_vertices); + + int32_t *test_array; + cudaMalloc(&test_array, num_vertices * sizeof(int32_t)); + cudaMemcpyToSymbol(test_array_1, &test_array, sizeof(int32_t*), 0); + + int32_t *test_array_host = new int32_t[num_vertices]; + cudaMemset(test_array, 0, sizeof(int32_t) * num_vertices); + + gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(num_vertices); + + builtin_addVertex(frontier, 0); + builtin_addVertex(frontier, 7); + builtin_addVertex(frontier, 13); + + + gpu_runtime::vertex_set_apply_kernel<<>>(frontier); + + cudaMemcpy(test_array_host, test_array, sizeof(int32_t) * num_vertices, cudaMemcpyDeviceToHost); + cudaFree(test_array); + for (int32_t index = 0; index < num_vertices; index++) { + if (index == 0 || index == 7 || index == 
13) + EXPECT_EQ(1, test_array_host[index]); + else + EXPECT_EQ(0, test_array_host[index]); + } +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + std::cout << "Test needs path to graph directory as first argument" << std::endl; + exit(-1); + } + graph_directory = argv[1]; + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/gpu_tests/test_input/schedules/sssp_CM_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_CM_kernel_fusion_schedule.gt new file mode 100644 index 00000000..718654bf --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_CM_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(CM); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt new file mode 100644 index 00000000..c6d6052e --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(CM); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWCE_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWCE_kernel_fusion_schedule.gt new file mode 100644 index 00000000..7bfdaab5 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_TWCE_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt new file mode 100644 index 00000000..d309bef4 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWC_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWC_kernel_fusion_schedule.gt new file mode 100644 index 00000000..59721e6f --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_TWC_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWC); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt new file mode 100644 index 00000000..6c8f164f --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWC); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_WM_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_WM_kernel_fusion_schedule.gt new file mode 100644 index 00000000..74ffe161 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_WM_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(WM); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git 
a/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt new file mode 100644 index 00000000..9df6f14b --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(WM); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt new file mode 100644 index 00000000..225cce1b --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt @@ -0,0 +1,3 @@ +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_strict_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_strict_kernel_fusion_schedule.gt new file mode 100644 index 00000000..f9c4b730 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_strict_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(STRICT); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_strict_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_strict_schedule.gt new file mode 100644 index 00000000..e458ca60 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_strict_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(STRICT); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_vertex_based_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_vertex_based_kernel_fusion_schedule.gt new file mode 100644 index 00000000..67d773a5 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_vertex_based_kernel_fusion_schedule.gt @@ -0,0 +1,7 @@ +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/simple_graph_load.gt b/test/gpu_tests/test_input/simple_graph_load.gt new file mode 100644 index 00000000..9af25ff8 --- /dev/null +++ b/test/gpu_tests/test_input/simple_graph_load.gt @@ -0,0 +1,8 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex, Vertex, int) = load (argv[1]); + +func main() + #s1# print edges.getVertices(); +end diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu new file mode 100644 index 00000000..4f85f5e3 --- /dev/null +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -0,0 +1,171 @@ + + + +#define VIRTUAL_WARP_SIZE (32) +#define NUM_THREADS (1024) +#define NUM_BLOCKS (80) +#define CTA_SIZE (1024) +#define WARP_SIZE (32) +#define STAGE_1_SIZE (8) + + +#include "gpu_intrinsics.h" +#include + + +#define USE_DEDUP 0 +#define SORT_NODES 0 +#include + +//#define DEBUG + +#ifdef DEBUG + #define ITER_COUNT (5) +#else + #define ITER_COUNT (1) +#endif + +gpu_runtime::GPUPriorityQueue host_gpq; +gpu_runtime::GPUPriorityQueue __device__ device_gpq; + + +int32_t __device__ *SP; +int32_t *__host_SP; +int32_t *__device_SP; + + +void __global__ init_kernel(gpu_runtime::GraphT graph, int start_v) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = graph.num_vertices; + int 
work_per_thread = (total_work + num_threads - 1)/num_threads; + if (thread_id == 0) { + //reset with the new data structure + SP[start_v] = 0; + } +} + +/*bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { + bool output2; + bool SP_trackving_var_1 = 0; + SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); + output2 = SP_trackving_var_1; + if (SP[dst] >= (device_gpq.current_priority_ + device_gpq.delta_)) return false; + return output2; + }*/ + +void __device__ deviceUpdateEdge(int32_t src, int32_t dst, int32_t weight, gpu_runtime::VertexFrontier output_frontier){ + device_gpq.updatePriorityMin(&device_gpq, (SP[src] + weight), output_frontier, dst); +} + +template +void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) { + // Body of the actual operator code + EdgeWeightType weight = graph.d_edge_weight[edge_id]; + deviceUpdateEdge(src, dst, weight, output_frontier); + /*if (updateEdge(src, dst, weight)){ + gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst); + }*/ +} + +void __device__ SP_generated_vector_op_apply_func_0(int32_t v) { + SP[v] = 2147483647; +} + +int main(int argc, char *argv[]) { + cudaSetDevice(0); + cudaThreadSetCacheConfig(cudaFuncCachePreferShared); + gpu_runtime::GraphT graph; + gpu_runtime::load_graph(graph, argv[1], false); + int32_t delta = atoi(argv[3]); + int32_t start_vertex = atoi(argv[2]); + + cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); + cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); + __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; + cudaDeviceSynchronize(); + float total_time = 0; + for (int outer = 0; outer < ITER_COUNT; outer++) { + float iter_total = 0; + //this sets it to Sparse + //host_gpq.frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + + gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); + startTimer(); + + host_gpq.init(graph, __host_SP, __device_SP, 0, delta, start_vertex); + + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); + + init_kernel<<>>(graph, start_vertex); + gpu_runtime::cudaCheckLastError(); + + int iters = 0; + cudaDeviceSynchronize(); + float t = stopTimer(); + //printf("Init time = %f\n", t); + iter_total+=t; + + gpu_runtime::GPUPriorityQueue * tmp_gpq; + cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); + + while(! 
host_gpq.finished(tmp_gpq)){ + startTimer(); + iters++; + + gpu_runtime::VertexFrontier& frontier = host_gpq.dequeueReadySet(tmp_gpq); + + gpu_runtime::vertex_set_prepare_sparse(frontier); + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); + + gpu_runtime::TWCE_load_balance_host(graph, frontier, frontier); + gpu_runtime::cudaCheckLastError(); + + gpu_runtime::swap_bytemaps(frontier); + // set the input to the prepare function + frontier.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; + + cudaDeviceSynchronize(); + t = stopTimer(); + + #ifdef DEBUG + //printf("Iter %d output_size = %d \n", iters, gpu_runtime::builtin_getVertexSetSize(frontier)); + #endif + + iter_total += t; + } + + + #ifdef DEBUG + printf("Num iters = %d\n", iters); + printf("Time elapsed = %f\n", iter_total); + #endif + + total_time += iter_total; + + } + + #ifdef DEBUG + printf("Total time = %f\n", total_time); + #endif + + if (argc > 3) + if (argv[4][0] == 'v'){ + //FILE *output = fopen("output.txt", "w"); + cudaMemcpy(__host_SP, __device_SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); + #ifdef DEBUG + FILE *output = fopen("output.txt", "w"); + #endif + + for (int i = 0; i < graph.num_vertices; i++){ + #ifdef DEBUG + fprintf(output, "%d, %d\n", i, __host_SP[i]); + #else + printf("%d\n", __host_SP[i]); + #endif + } + } + return 0; +} diff --git a/test/gpu_tests/test_input/sssp_delta_stepping_frontier_byval.cu b/test/gpu_tests/test_input/sssp_delta_stepping_frontier_byval.cu new file mode 100644 index 00000000..35a10024 --- /dev/null +++ b/test/gpu_tests/test_input/sssp_delta_stepping_frontier_byval.cu @@ -0,0 +1,172 @@ + + + +#define VIRTUAL_WARP_SIZE (32) +#define NUM_THREADS (1024) +#define NUM_BLOCKS (80) +#define CTA_SIZE (1024) +#define WARP_SIZE (32) +#define STAGE_1_SIZE (8) + + +#include "gpu_intrinsics.h" +#include + + +#define USE_DEDUP 0 +#define SORT_NODES 0 +#include + +//#define DEBUG + +#ifdef DEBUG + #define ITER_COUNT (5) +#else + #define ITER_COUNT (1) +#endif + +gpu_runtime::GPUPriorityQueue host_gpq; +gpu_runtime::GPUPriorityQueue __device__ device_gpq; + + +int32_t __device__ *SP; +int32_t *__host_SP; +int32_t *__device_SP; + + +void __global__ init_kernel(gpu_runtime::GraphT graph, int start_v) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = graph.num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + if (thread_id == 0) { + //reset with the new data structure + SP[start_v] = 0; + } +} + +/*bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { + bool output2; + bool SP_trackving_var_1 = 0; + SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); + output2 = SP_trackving_var_1; + if (SP[dst] >= (device_gpq.current_priority_ + device_gpq.delta_)) return false; + return output2; + }*/ + +void __device__ deviceUpdateEdge(int32_t src, int32_t dst, int32_t weight, gpu_runtime::VertexFrontier output_frontier){ + device_gpq.updatePriorityMin(&device_gpq, (SP[src] + weight), output_frontier, dst); +} + +template +void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) { + // Body of the actual operator code + EdgeWeightType weight = graph.d_edge_weight[edge_id]; + deviceUpdateEdge(src, dst, weight, output_frontier); + /*if (updateEdge(src, dst, weight)){ + 
gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst); + }*/ +} + +void __device__ SP_generated_vector_op_apply_func_0(int32_t v) { + SP[v] = 2147483647; +} + +int main(int argc, char *argv[]) { + cudaSetDevice(0); + cudaThreadSetCacheConfig(cudaFuncCachePreferShared); + gpu_runtime::GraphT graph; + gpu_runtime::load_graph(graph, argv[1], false); + int32_t delta = atoi(argv[3]); + int32_t start_vertex = atoi(argv[2]); + + cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); + cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); + __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; + cudaDeviceSynchronize(); + float total_time = 0; + for (int outer = 0; outer < ITER_COUNT; outer++) { + float iter_total = 0; + //this sets it to Sparse + //host_gpq.frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + + gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); + startTimer(); + + host_gpq.init(graph, __host_SP, __device_SP, 0, delta, start_vertex); + + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); + + init_kernel<<>>(graph, start_vertex); + gpu_runtime::cudaCheckLastError(); + + int iters = 0; + cudaDeviceSynchronize(); + float t = stopTimer(); + //printf("Init time = %f\n", t); + iter_total+=t; + + gpu_runtime::GPUPriorityQueue * tmp_gpq; + cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); + + while(! host_gpq.finished(tmp_gpq)){ + startTimer(); + iters++; + + gpu_runtime::VertexFrontier frontier = host_gpq.dequeueReadySet(tmp_gpq); + + gpu_runtime::vertex_set_prepare_sparse(frontier); + host_gpq.frontier_ = frontier; + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); + + gpu_runtime::TWCE_load_balance_host(graph, frontier, host_gpq.frontier_); + gpu_runtime::cudaCheckLastError(); + + gpu_runtime::swap_bytemaps(host_gpq.frontier_); + // set the input to the prepare function + host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; + + cudaDeviceSynchronize(); + t = stopTimer(); + + #ifdef DEBUG + //printf("Iter %d output_size = %d \n", iters, gpu_runtime::builtin_getVertexSetSize(frontier)); + #endif + + iter_total += t; + } + + + #ifdef DEBUG + printf("Num iters = %d\n", iters); + printf("Time elapsed = %f\n", iter_total); + #endif + + total_time += iter_total; + + } + + #ifdef DEBUG + printf("Total time = %f\n", total_time); + #endif + + if (argc > 3) + if (argv[4][0] == 'v'){ + //FILE *output = fopen("output.txt", "w"); + cudaMemcpy(__host_SP, __device_SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); + #ifdef DEBUG + FILE *output = fopen("output.txt", "w"); + #endif + + for (int i = 0; i < graph.num_vertices; i++){ + #ifdef DEBUG + fprintf(output, "%d, %d\n", i, __host_SP[i]); + #else + printf("%d\n", __host_SP[i]); + #endif + } + } + return 0; +} diff --git a/test/gpu_tests/test_input/sssp_lp.cu b/test/gpu_tests/test_input/sssp_lp.cu new file mode 100644 index 00000000..37688991 --- /dev/null +++ b/test/gpu_tests/test_input/sssp_lp.cu @@ -0,0 +1,358 @@ +#include "gpu_intrinsics.h" +#include + +#define ITER_COUNT (1) +#define USE_DEDUP 0 +#define SORT_NODES 0 +#include +#include +#include + + +typedef struct { + int32_t *SP; + + int32_t *frontier1; + int32_t *frontier2; + + int32_t *frontier1_size; + int32_t *frontier2_size; + int32_t *iters; + + int32_t *output_size; + + int32_t num_blocks; + + 
int32_t *node_borders; + int32_t *edge_borders; + + int32_t *worklist; + int32_t *old_indices; +}algo_state; + +//struct timeval start_time_; +//struct timeval elapsed_time_; + +// void startTimer(){ +// gettimeofday(&start_time_, NULL); +// } + +// float stopTimer(){ +// gettimeofday(&elapsed_time_, NULL); +// elapsed_time_.tv_sec -= start_time_.tv_sec; +// elapsed_time_.tv_usec -= start_time_.tv_usec; +// return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; +// } + +void cudaCheckLastError(void) { + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + printf("Error: %s\n", cudaGetErrorString(err)); +} + + +#define VIRTUAL_WARP_SIZE (32) +#define NUM_THREADS (1024) +#define NUM_BLOCKS (80) +#define CTA_SIZE (1024) +#define WARP_SIZE (32) +#define STAGE_1_SIZE (8) + +void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state, int32_t start_vertex) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = graph.num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + for (int i = 0; i < work_per_thread; i++) { + int id = num_threads * i + thread_id; + if (id < total_work) { + device_state.SP[id] = INT_MAX; + device_state.iters[id] = 0; + } + } + if (thread_id == 0) { + device_state.SP[start_vertex] = 0; + //starting point is set to 0 + device_state.frontier1[0] = start_vertex; + *device_state.frontier1_size = 1; + *device_state.frontier2_size = 0; + } +} +__device__ inline int warp_bcast(int v, int leader) { return __shfl_sync(__activemask(), v, leader); } +__device__ inline int atomicAggInc(int *ctr) { + int32_t lane_id = threadIdx.x % 32; + + int mask = __activemask(); + int leader = __ffs(mask) - 1; + int res; + if(lane_id == leader) + res = atomicAdd(ctr, __popc(mask)); + res = warp_bcast(res, leader); + + return (res + __popc(mask & ((1 << lane_id) - 1))); +} +__device__ void enqueueVertex(int32_t v, algo_state &device_state, int32_t curr_iter) { + if (device_state.iters[v] == curr_iter) + return; + device_state.iters[v] = curr_iter; + int32_t pos = atomicAggInc(device_state.frontier2_size); + device_state.frontier2[pos] = v; +} + +void __global__ update_edges (gpu_runtime::GraphT graph, algo_state device_state, int32_t curr_iter) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + + int lane_id = thread_id % 32; + + __shared__ int32_t stage2_queue[CTA_SIZE]; + __shared__ int32_t stage3_queue[CTA_SIZE]; + __shared__ int32_t stage_queue_sizes[3]; + if (threadIdx.x == 0) { + stage_queue_sizes[0] = 0; + stage_queue_sizes[1] = 0; + stage_queue_sizes[2] = 0; + } + __syncthreads(); + + + __shared__ int32_t stage2_offset[CTA_SIZE]; + __shared__ int32_t stage3_offset[CTA_SIZE]; + + __shared__ int32_t stage2_size[CTA_SIZE]; + __shared__ int32_t stage3_size[CTA_SIZE]; + + + int32_t total_vertices = device_state.frontier1_size[0]; + + int32_t my_vertex_idx = thread_id / (STAGE_1_SIZE); + int32_t d; + int32_t s1_offset; + int32_t my_vertex; + int32_t row_offset; + if (my_vertex_idx < total_vertices) { + my_vertex = device_state.frontier1[my_vertex_idx]; + // Step 1 segreggate vertices into shared buffers + if (thread_id % (STAGE_1_SIZE) == 0 ) { + d = graph.d_get_degree(my_vertex); + row_offset = graph.d_src_offsets[my_vertex]; + int32_t s3_size = d/CTA_SIZE; + d = d - s3_size * CTA_SIZE; + if (s3_size) { + int32_t pos = atomicAggInc(&stage_queue_sizes[2]); + stage3_queue[pos] = my_vertex; + stage3_size[pos] = s3_size * CTA_SIZE; + // stage3_offset[pos] = 0; // 
Not required because always 0 + stage3_offset[pos] = row_offset; + } + + int32_t s2_size = d/WARP_SIZE; + d = d - s2_size * WARP_SIZE; + + if (s2_size) { + int32_t pos = atomicAggInc(&stage_queue_sizes[1]); + stage2_queue[pos] = my_vertex; + stage2_offset[pos] = s3_size * CTA_SIZE + row_offset; + stage2_size[pos] = s2_size * WARP_SIZE; + } + s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + row_offset; + } + }else + my_vertex = -1; + + __syncthreads(); + + d = __shfl_sync((uint32_t)-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + int32_t src_distance; + if (my_vertex_idx < total_vertices) { + // STAGE 1 + my_vertex = device_state.frontier1[my_vertex_idx]; + src_distance = device_state.SP[my_vertex]; + for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < d + s1_offset; neigh_id += STAGE_1_SIZE) { + // DO ACTUAL SSSP + int32_t dst = graph.d_edge_dst[neigh_id]; + int32_t new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } + // STAGE 2 -- stage 2 is dynamically balanced + while (1) { + int32_t to_process; + if (lane_id == 0) { + to_process = atomicSub(&stage_queue_sizes[1], 1) - 1; + } + to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32); + if (to_process < 0) + break; + my_vertex = stage2_queue[to_process]; + d = stage2_size[to_process]; + int32_t s2_offset = stage2_offset[to_process]; + src_distance = device_state.SP[my_vertex]; + + for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < d + s2_offset; neigh_id += WARP_SIZE) { + // DO ACTUAL SSSP + int dst = graph.d_edge_dst[neigh_id]; + int new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } + + // STAGE 3 -- all threads have to do all, no need for LB + for (int32_t wid = 0; wid < stage_queue_sizes[2]; wid ++) { + my_vertex = stage3_queue[wid]; + d = stage3_size[wid]; + int32_t s3_offset = stage3_offset[wid]; + src_distance = device_state.SP[my_vertex]; + + for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < d + s3_offset; neigh_id += CTA_SIZE) { + // DO ACTUAL SSSP + int dst = graph.d_edge_dst[neigh_id]; + int new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } +} +void __global__ update_nodes (gpu_runtime::GraphT graph, algo_state device_state) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + + int total_work = graph.num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < graph.num_vertices) { + if (device_state.frontier2[node_id]) { + device_state.frontier2[node_id] = 0; + int pos = atomicAdd(device_state.frontier1_size, 1); + device_state.frontier1[pos] = node_id; + } + } + } + +} +void allocate_state(algo_state &host_state, algo_state &device_state, gpu_runtime::GraphT &graph) { + host_state.SP = new int[graph.num_vertices]; + host_state.output_size = new int32_t[1]; + + host_state.frontier1_size = new int32_t[1]; + host_state.frontier1 = new 
+void __global__ update_edges(gpu_runtime::GraphT<int32_t> graph, algo_state device_state, int32_t curr_iter) {
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+
+    int lane_id = thread_id % 32;
+
+    __shared__ int32_t stage2_queue[CTA_SIZE];
+    __shared__ int32_t stage3_queue[CTA_SIZE];
+    __shared__ int32_t stage_queue_sizes[3];
+    if (threadIdx.x == 0) {
+        stage_queue_sizes[0] = 0;
+        stage_queue_sizes[1] = 0;
+        stage_queue_sizes[2] = 0;
+    }
+    __syncthreads();
+
+    __shared__ int32_t stage2_offset[CTA_SIZE];
+    __shared__ int32_t stage3_offset[CTA_SIZE];
+
+    __shared__ int32_t stage2_size[CTA_SIZE];
+    __shared__ int32_t stage3_size[CTA_SIZE];
+
+    int32_t total_vertices = device_state.frontier1_size[0];
+
+    int32_t my_vertex_idx = thread_id / (STAGE_1_SIZE);
+    int32_t d;
+    int32_t s1_offset;
+    int32_t my_vertex;
+    int32_t row_offset;
+    if (my_vertex_idx < total_vertices) {
+        my_vertex = device_state.frontier1[my_vertex_idx];
+        // Step 1: segregate vertices into the shared stage-2/stage-3 buffers
+        if (thread_id % (STAGE_1_SIZE) == 0) {
+            d = graph.d_get_degree(my_vertex);
+            row_offset = graph.d_src_offsets[my_vertex];
+            int32_t s3_size = d/CTA_SIZE;
+            d = d - s3_size * CTA_SIZE;
+            if (s3_size) {
+                int32_t pos = atomicAggInc(&stage_queue_sizes[2]);
+                stage3_queue[pos] = my_vertex;
+                stage3_size[pos] = s3_size * CTA_SIZE;
+                // stage3_offset[pos] = 0; // Not required because always 0
+                stage3_offset[pos] = row_offset;
+            }
+
+            int32_t s2_size = d/WARP_SIZE;
+            d = d - s2_size * WARP_SIZE;
+
+            if (s2_size) {
+                int32_t pos = atomicAggInc(&stage_queue_sizes[1]);
+                stage2_queue[pos] = my_vertex;
+                stage2_offset[pos] = s3_size * CTA_SIZE + row_offset;
+                stage2_size[pos] = s2_size * WARP_SIZE;
+            }
+            s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + row_offset;
+        }
+    } else
+        my_vertex = -1;
+
+    __syncthreads();
+
+    d = __shfl_sync((uint32_t)-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32);
+    s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32);
+    int32_t src_distance;
+    if (my_vertex_idx < total_vertices) {
+        // STAGE 1
+        my_vertex = device_state.frontier1[my_vertex_idx];
+        src_distance = device_state.SP[my_vertex];
+        for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < d + s1_offset; neigh_id += STAGE_1_SIZE) {
+            // DO ACTUAL SSSP
+            int32_t dst = graph.d_edge_dst[neigh_id];
+            int32_t new_dst = graph.d_edge_weight[neigh_id] + src_distance;
+            if (new_dst < device_state.SP[dst]) {
+                atomicMin(&device_state.SP[dst], new_dst);
+                enqueueVertex(dst, device_state, curr_iter);
+            }
+        }
+    }
+    // STAGE 2 -- stage 2 is dynamically balanced
+    while (1) {
+        int32_t to_process;
+        if (lane_id == 0) {
+            to_process = atomicSub(&stage_queue_sizes[1], 1) - 1;
+        }
+        to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32);
+        if (to_process < 0)
+            break;
+        my_vertex = stage2_queue[to_process];
+        d = stage2_size[to_process];
+        int32_t s2_offset = stage2_offset[to_process];
+        src_distance = device_state.SP[my_vertex];
+
+        for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < d + s2_offset; neigh_id += WARP_SIZE) {
+            // DO ACTUAL SSSP
+            int dst = graph.d_edge_dst[neigh_id];
+            int new_dst = graph.d_edge_weight[neigh_id] + src_distance;
+            if (new_dst < device_state.SP[dst]) {
+                atomicMin(&device_state.SP[dst], new_dst);
+                enqueueVertex(dst, device_state, curr_iter);
+            }
+        }
+    }
+
+    // STAGE 3 -- all threads have to do all, no need for LB
+    for (int32_t wid = 0; wid < stage_queue_sizes[2]; wid++) {
+        my_vertex = stage3_queue[wid];
+        d = stage3_size[wid];
+        int32_t s3_offset = stage3_offset[wid];
+        src_distance = device_state.SP[my_vertex];
+
+        for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < d + s3_offset; neigh_id += CTA_SIZE) {
+            // DO ACTUAL SSSP
+            int dst = graph.d_edge_dst[neigh_id];
+            int new_dst = graph.d_edge_weight[neigh_id] + src_distance;
+            if (new_dst < device_state.SP[dst]) {
+                atomicMin(&device_state.SP[dst], new_dst);
+                enqueueVertex(dst, device_state, curr_iter);
+            }
+        }
+    }
+}
+
+void __global__ update_nodes(gpu_runtime::GraphT<int32_t> graph, algo_state device_state) {
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int num_threads = blockDim.x * gridDim.x;
+
+    int total_work = graph.num_vertices;
+    int work_per_thread = (total_work + num_threads - 1)/num_threads;
+
+    for (int i = 0; i < work_per_thread; i++) {
+        int32_t node_id = thread_id + i * num_threads;
+        if (node_id < graph.num_vertices) {
+            if (device_state.frontier2[node_id]) {
+                device_state.frontier2[node_id] = 0;
+                int pos = atomicAdd(device_state.frontier1_size, 1);
+                device_state.frontier1[pos] = node_id;
+            }
+        }
+    }
+}
+
+void allocate_state(algo_state &host_state, algo_state &device_state, gpu_runtime::GraphT<int32_t> &graph) {
+    host_state.SP = new int[graph.num_vertices];
+    host_state.output_size = new int32_t[1];
+
+    host_state.frontier1_size = new int32_t[1];
+    host_state.frontier1 = new int32_t[graph.num_vertices];
+
+    cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_vertices);
+
+    cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_vertices * 6);
+    cudaMalloc(&device_state.frontier2, sizeof(int32_t)*graph.num_vertices * 6);
+    cudaMalloc(&device_state.iters, sizeof(int32_t)*graph.num_vertices);
+
+    cudaMalloc(&device_state.frontier1_size, sizeof(int32_t));
+    cudaMalloc(&device_state.frontier2_size, sizeof(int32_t));
+
+    cudaMalloc(&device_state.output_size, sizeof(int32_t));
+
+    cudaMalloc(&device_state.worklist, sizeof(int32_t));
+}
+
+void swap_pointers(int32_t **a, int32_t **b) {
+    int32_t* t = *a;
+    *a = *b;
+    *b = t;
+}
+void swap_queues(algo_state &device_state) {
+    swap_pointers(&device_state.frontier1, &device_state.frontier2);
+    swap_pointers(&device_state.frontier1_size, &device_state.frontier2_size);
+}
+
+template <typename EdgeWeightType>
+void __device__ gpu_operator_body_3(gpu_runtime::GraphT<EdgeWeightType> graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {
+    // Body of the actual operator code
+    EdgeWeightType weight = graph.d_edge_weight[edge_id];
+    //if (updateEdge(src, dst, weight)) {
+        gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst);
+    //}
+}
+
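+// Host driver: after init_kernel seeds the frontier with the start vertex, each
+// iteration launches update_edges with STAGE_1_SIZE threads per frontier vertex,
+// clears the consumed frontier's size on the device, swaps the frontier1/frontier2
+// buffers (and their size counters), and copies the new frontier size back to the
+// host; the loop terminates once the frontier is empty. The synchronous
+// cudaMemcpy calls double as the per-iteration synchronization points.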
+int main(int argc, char *argv[]) {
+    cudaSetDevice(0);
+    cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
+    gpu_runtime::GraphT<int32_t> graph;
+    gpu_runtime::load_graph(graph, argv[1], false);
+    int32_t start_vertex = atoi(argv[2]);
+
+    algo_state host_state, device_state;
+
+    allocate_state(host_state, device_state, graph);
+
+    cudaDeviceSynchronize();
+
+    float total_time = 0;
+    for (int outer = 0; outer < ITER_COUNT; outer++) {
+        float iter_total = 0;
+        startTimer();
+
+        startTimer();
+        init_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(graph, device_state, start_vertex);
+        int iters = 0;
+        cudaDeviceSynchronize();
+        float t = stopTimer();
+        //printf("Init time = %f\n", t);
+        iter_total += t;
+
+        host_state.frontier1_size[0] = 1;
+        while (*host_state.frontier1_size) {
+            startTimer();
+            iters++;
+            int num_threads = *host_state.frontier1_size * (STAGE_1_SIZE);
+            int num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE;
+
+            update_edges<<<num_cta, CTA_SIZE>>>(graph, device_state, iters);
+            //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier);
+
+            host_state.frontier1_size[0] = 0;
+            cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice);
+
+            swap_queues(device_state);
+
+            cudaCheckLastError();
+            cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t), cudaMemcpyDeviceToHost);
+
+            t = stopTimer();
+            //printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE);
+            iter_total += t;
+        }
+
+        //printf("Num iters = %d\n", iters);
+        //printf("Time elapsed = %f\n", iter_total);
+        total_time += iter_total;
+    }
+    //printf("Total time = %f\n", total_time);
+    if (argc > 3)
+        if (argv[3][0] == 'v') {
+            //FILE *output = fopen("output.txt", "w");
+            cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost);
+            for (int i = 0; i < graph.num_vertices; i++)
+                //fprintf(output, "%d, %d\n", i, host_state.SP[i]);
+                printf("%d\n", host_state.SP[i]);
+        } else if (argv[3][0] == 'c') {
+            /*
+            for (int i = 0; i < NUM_BLOCKS * NUM_THREADS; i++)
+                printf("%d: %d\n", i, counters[i]);
+            */
+        }
+
+    return 0;
+}
diff --git a/test/graphs/4.mtx b/test/graphs/simple_mtx.mtx
similarity index 100%
rename from test/graphs/4.mtx
rename to test/graphs/simple_mtx.mtx
diff --git a/test/python/pybind_test.py b/test/python/pybind_test.py
index ed25aaf6..5998a584 100644
--- a/test/python/pybind_test.py
+++ b/test/python/pybind_test.py
@@ -121,7 +121,7 @@ def test_pybind_pr_with_vector_input(self):
 
     def test_pybind_pr_load_file(self):
         module = graphit.compile_and_load(self.root_test_input_dir + "export_pr_with_return.gt")
-        graph = csr_matrix(scipy.io.mmread(self.root_test_graph_dir+"4.mtx"))
+        graph = csr_matrix(scipy.io.mmread(self.root_test_graph_dir+"simple_mtx.mtx"))
         ranks = module.export_func(graph)
         self.assertEqual(len(ranks), graph.shape[0])
         self.assertTrue(abs(np.sum(ranks)-1.0) < 0.1)
diff --git a/test/verifiers/bc_verifier.cpp b/test/verifiers/bc_verifier.cpp
index eb69d11f..292440b4 100644
--- a/test/verifiers/bc_verifier.cpp
+++ b/test/verifiers/bc_verifier.cpp
@@ -68,7 +68,7 @@ bool BCVerifier(const Graph &g, NodeID source, NodeID num_iters,
   // Compare scores
   bool all_ok = true;
   for (NodeID n : g.vertices()) {
-    if (abs(scores[n] - scores_to_test[n]) > 0.000001) {
+    if (abs(scores[n] - scores_to_test[n]) > 0.001) {
       cout << n << ": " << scores[n] << " != " << scores_to_test[n] << endl;
       all_ok = false;
     }