diff --git a/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg.hjson b/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg.hjson
index c80b34c..5c3537d 100644
--- a/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg.hjson
+++ b/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg.hjson
@@ -415,6 +415,18 @@
             desc: "Indicates the cluster is computing a kernel."
         }]
     },
+    {
+        name: "SPATZ_CYCLE",
+        desc: '''Store cycle counts of kernels'''
+        swaccess: "rw",
+        hwaccess: "hrw",
+        resval: "0",
+        fields: [{
+            bits: "31:0",
+            name: "SPATZ_CYC",
+            desc: "Store cycle counts of kernels."
+        }]
+    },
     {
         name: "CLUSTER_BOOT_CONTROL",
         desc: '''Controls the cluster boot process.'''
diff --git a/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg_pkg.sv b/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg_pkg.sv
index c39af85..bc9db5f 100644
--- a/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg_pkg.sv
+++ b/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg_pkg.sv
@@ -143,6 +143,10 @@ package spatz_cluster_peripheral_reg_pkg;
     logic        q;
   } spatz_cluster_peripheral_reg2hw_spatz_status_reg_t;
 
+  typedef struct packed {
+    logic [31:0] q;
+  } spatz_cluster_peripheral_reg2hw_spatz_cycle_reg_t;
+
   typedef struct packed {
     logic [31:0] q;
   } spatz_cluster_peripheral_reg2hw_cluster_boot_control_reg_t;
@@ -175,6 +179,11 @@ package spatz_cluster_peripheral_reg_pkg;
     logic [31:0] d;
   } spatz_cluster_peripheral_hw2reg_hw_barrier_reg_t;
 
+  typedef struct packed {
+    logic [31:0] d;
+    logic        de;
+  } spatz_cluster_peripheral_hw2reg_spatz_cycle_reg_t;
+
   typedef struct packed {
     logic        d;
     logic        de;
@@ -191,14 +200,15 @@ package spatz_cluster_peripheral_reg_pkg;
 
   // Register -> HW type
   typedef struct packed {
-    spatz_cluster_peripheral_reg2hw_perf_counter_enable_mreg_t [1:0] perf_counter_enable; // [326:265]
-    spatz_cluster_peripheral_reg2hw_hart_select_mreg_t [1:0] hart_select; // [264:245]
-    spatz_cluster_peripheral_reg2hw_perf_counter_mreg_t [1:0] perf_counter; // [244:147]
-    spatz_cluster_peripheral_reg2hw_cl_clint_set_reg_t cl_clint_set; // [146:114]
-    spatz_cluster_peripheral_reg2hw_cl_clint_clear_reg_t cl_clint_clear; // [113:81]
-    spatz_cluster_peripheral_reg2hw_hw_barrier_reg_t hw_barrier; // [80:49]
-    spatz_cluster_peripheral_reg2hw_icache_prefetch_enable_reg_t icache_prefetch_enable; // [48:48]
-    spatz_cluster_peripheral_reg2hw_spatz_status_reg_t spatz_status; // [47:47]
+    spatz_cluster_peripheral_reg2hw_perf_counter_enable_mreg_t [1:0] perf_counter_enable; // [358:297]
+    spatz_cluster_peripheral_reg2hw_hart_select_mreg_t [1:0] hart_select; // [296:277]
+    spatz_cluster_peripheral_reg2hw_perf_counter_mreg_t [1:0] perf_counter; // [276:179]
+    spatz_cluster_peripheral_reg2hw_cl_clint_set_reg_t cl_clint_set; // [178:146]
+    spatz_cluster_peripheral_reg2hw_cl_clint_clear_reg_t cl_clint_clear; // [145:113]
+    spatz_cluster_peripheral_reg2hw_hw_barrier_reg_t hw_barrier; // [112:81]
+    spatz_cluster_peripheral_reg2hw_icache_prefetch_enable_reg_t icache_prefetch_enable; // [80:80]
+    spatz_cluster_peripheral_reg2hw_spatz_status_reg_t spatz_status; // [79:79]
+    spatz_cluster_peripheral_reg2hw_spatz_cycle_reg_t spatz_cycle; // [78:47]
     spatz_cluster_peripheral_reg2hw_cluster_boot_control_reg_t cluster_boot_control; // [46:15]
     spatz_cluster_peripheral_reg2hw_cluster_eoc_exit_reg_t cluster_eoc_exit; // [14:14]
     spatz_cluster_peripheral_reg2hw_cfg_l1d_spm_reg_t cfg_l1d_spm; // [13:4]
@@ -209,8 +219,9 @@ package spatz_cluster_peripheral_reg_pkg;
 
   // HW -> register type
   typedef struct packed {
-    spatz_cluster_peripheral_hw2reg_perf_counter_mreg_t [1:0] perf_counter; // [132:37]
-    spatz_cluster_peripheral_hw2reg_hw_barrier_reg_t hw_barrier; // [36:5]
+    spatz_cluster_peripheral_hw2reg_perf_counter_mreg_t [1:0] perf_counter; // [165:70]
+    spatz_cluster_peripheral_hw2reg_hw_barrier_reg_t hw_barrier; // [69:38]
+    spatz_cluster_peripheral_hw2reg_spatz_cycle_reg_t spatz_cycle; // [37:5]
     spatz_cluster_peripheral_hw2reg_l1d_spm_commit_reg_t l1d_spm_commit; // [4:3]
     spatz_cluster_peripheral_hw2reg_l1d_insn_commit_reg_t l1d_insn_commit; // [2:1]
     spatz_cluster_peripheral_hw2reg_l1d_flush_status_reg_t l1d_flush_status; // [0:0]
@@ -228,13 +239,14 @@ package spatz_cluster_peripheral_reg_pkg;
   parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_HW_BARRIER_OFFSET = 8'h 40;
   parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_ICACHE_PREFETCH_ENABLE_OFFSET = 8'h 48;
   parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_SPATZ_STATUS_OFFSET = 8'h 50;
-  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_OFFSET = 8'h 58;
-  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT_OFFSET = 8'h 60;
-  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_OFFSET = 8'h 68;
-  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_OFFSET = 8'h 70;
-  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT_OFFSET = 8'h 78;
-  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT_OFFSET = 8'h 80;
-  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS_OFFSET = 8'h 88;
+  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_OFFSET = 8'h 58;
+  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_OFFSET = 8'h 60;
+  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT_OFFSET = 8'h 68;
+  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_OFFSET = 8'h 70;
+  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_OFFSET = 8'h 78;
+  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT_OFFSET = 8'h 80;
+  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT_OFFSET = 8'h 88;
+  parameter logic [BlockAw-1:0] SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS_OFFSET = 8'h 90;
 
   // Reset values for hwext registers and their fields
   parameter logic [47:0] SPATZ_CLUSTER_PERIPHERAL_PERF_COUNTER_0_RESVAL = 48'h 0;
@@ -258,6 +270,7 @@ package spatz_cluster_peripheral_reg_pkg;
     SPATZ_CLUSTER_PERIPHERAL_HW_BARRIER,
     SPATZ_CLUSTER_PERIPHERAL_ICACHE_PREFETCH_ENABLE,
     SPATZ_CLUSTER_PERIPHERAL_SPATZ_STATUS,
+    SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE,
     SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL,
     SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT,
     SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM,
@@ -268,7 +281,7 @@ package spatz_cluster_peripheral_reg_pkg;
   } spatz_cluster_peripheral_id_e;
 
   // Register width information to check illegal writes
-  parameter logic [3:0] SPATZ_CLUSTER_PERIPHERAL_PERMIT [18] = '{
+  parameter logic [3:0] SPATZ_CLUSTER_PERIPHERAL_PERMIT [19] = '{
     4'b 1111, // index[ 0] SPATZ_CLUSTER_PERIPHERAL_PERF_COUNTER_ENABLE_0
     4'b 1111, // index[ 1] SPATZ_CLUSTER_PERIPHERAL_PERF_COUNTER_ENABLE_1
     4'b 0011, // index[ 2] SPATZ_CLUSTER_PERIPHERAL_HART_SELECT_0
@@ -280,13 +293,14 @@ package spatz_cluster_peripheral_reg_pkg;
     4'b 1111, // index[ 8] SPATZ_CLUSTER_PERIPHERAL_HW_BARRIER
     4'b 0001, // index[ 9] SPATZ_CLUSTER_PERIPHERAL_ICACHE_PREFETCH_ENABLE
     4'b 0001, // index[10] SPATZ_CLUSTER_PERIPHERAL_SPATZ_STATUS
-    4'b 1111, // index[11] SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL
-    4'b 0001, // index[12] SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT
-    4'b 0011, // index[13] SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM
-    4'b 0001, // index[14] SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN
-    4'b 0001, // index[15] SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT
-    4'b 0001, // index[16] SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT
-    4'b 0001  // index[17] SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS
+    4'b 1111, // index[11] SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE
+    4'b 1111, // index[12] SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL
+    4'b 0001, // index[13] SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT
+    4'b 0011, // index[14] SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM
+    4'b 0001, // index[15] SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN
+    4'b 0001, // index[16] SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT
+    4'b 0001, // index[17] SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT
+    4'b 0001  // index[18] SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS
   };
 
 endpackage
diff --git a/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg_top.sv b/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg_top.sv
index d019907..3ec047a 100644
--- a/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg_top.sv
+++ b/hw/system/spatz_cluster/src/spatz_cluster_peripheral/spatz_cluster_peripheral_reg_top.sv
@@ -278,6 +278,9 @@ module spatz_cluster_peripheral_reg_top #(
   logic icache_prefetch_enable_we;
   logic spatz_status_wd;
   logic spatz_status_we;
+  logic [31:0] spatz_cycle_qs;
+  logic [31:0] spatz_cycle_wd;
+  logic spatz_cycle_we;
   logic [31:0] cluster_boot_control_qs;
   logic [31:0] cluster_boot_control_wd;
   logic cluster_boot_control_we;
@@ -2110,6 +2113,33 @@ module spatz_cluster_peripheral_reg_top #(
   );
 
 
+  // R[spatz_cycle]: V(False)
+
+  prim_subreg #(
+    .DW      (32),
+    .SWACCESS("RW"),
+    .RESVAL  (32'h0)
+  ) u_spatz_cycle (
+    .clk_i   (clk_i    ),
+    .rst_ni  (rst_ni  ),
+
+    // from register interface
+    .we     (spatz_cycle_we),
+    .wd     (spatz_cycle_wd),
+
+    // from internal hardware
+    .de     (hw2reg.spatz_cycle.de),
+    .d      (hw2reg.spatz_cycle.d ),
+
+    // to internal hardware
+    .qe     (),
+    .q      (reg2hw.spatz_cycle.q ),
+
+    // to register interface (read)
+    .qs     (spatz_cycle_qs)
+  );
+
+
   // R[cluster_boot_control]: V(False)
 
   prim_subreg #(
@@ -2290,7 +2320,7 @@ module spatz_cluster_peripheral_reg_top #(
 
 
 
-  logic [17:0] addr_hit;
+  logic [18:0] addr_hit;
   always_comb begin
     addr_hit = '0;
     addr_hit[ 0] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_PERF_COUNTER_ENABLE_0_OFFSET);
@@ -2304,13 +2334,14 @@ module spatz_cluster_peripheral_reg_top #(
     addr_hit[ 8] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_HW_BARRIER_OFFSET);
     addr_hit[ 9] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_ICACHE_PREFETCH_ENABLE_OFFSET);
     addr_hit[10] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_SPATZ_STATUS_OFFSET);
-    addr_hit[11] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_OFFSET);
-    addr_hit[12] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT_OFFSET);
-    addr_hit[13] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_OFFSET);
-    addr_hit[14] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_OFFSET);
-    addr_hit[15] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT_OFFSET);
-    addr_hit[16] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT_OFFSET);
-    addr_hit[17] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS_OFFSET);
+    addr_hit[11] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_OFFSET);
+    addr_hit[12] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_OFFSET);
+    addr_hit[13] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT_OFFSET);
+    addr_hit[14] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_OFFSET);
+    addr_hit[15] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_OFFSET);
+    addr_hit[16] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT_OFFSET);
+    addr_hit[17] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT_OFFSET);
+    addr_hit[18] = (reg_addr == SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS_OFFSET);
   end
 
   assign addrmiss = (reg_re || reg_we) ? ~|addr_hit : 1'b0 ;
@@ -2335,7 +2366,8 @@ module spatz_cluster_peripheral_reg_top #(
                (addr_hit[14] & (|(SPATZ_CLUSTER_PERIPHERAL_PERMIT[14] & ~reg_be))) |
                (addr_hit[15] & (|(SPATZ_CLUSTER_PERIPHERAL_PERMIT[15] & ~reg_be))) |
                (addr_hit[16] & (|(SPATZ_CLUSTER_PERIPHERAL_PERMIT[16] & ~reg_be))) |
-               (addr_hit[17] & (|(SPATZ_CLUSTER_PERIPHERAL_PERMIT[17] & ~reg_be)))));
+               (addr_hit[17] & (|(SPATZ_CLUSTER_PERIPHERAL_PERMIT[17] & ~reg_be))) |
+               (addr_hit[18] & (|(SPATZ_CLUSTER_PERIPHERAL_PERMIT[18] & ~reg_be)))));
   end
 
   assign perf_counter_enable_0_cycle_0_we = addr_hit[0] & reg_we & !reg_error;
@@ -2552,25 +2584,28 @@ module spatz_cluster_peripheral_reg_top #(
   assign spatz_status_we = addr_hit[10] & reg_we & !reg_error;
   assign spatz_status_wd = reg_wdata[0];
 
-  assign cluster_boot_control_we = addr_hit[11] & reg_we & !reg_error;
+  assign spatz_cycle_we = addr_hit[11] & reg_we & !reg_error;
+  assign spatz_cycle_wd = reg_wdata[31:0];
+
+  assign cluster_boot_control_we = addr_hit[12] & reg_we & !reg_error;
   assign cluster_boot_control_wd = reg_wdata[31:0];
 
-  assign cluster_eoc_exit_we = addr_hit[12] & reg_we & !reg_error;
+  assign cluster_eoc_exit_we = addr_hit[13] & reg_we & !reg_error;
   assign cluster_eoc_exit_wd = reg_wdata[0];
 
-  assign cfg_l1d_spm_we = addr_hit[13] & reg_we & !reg_error;
+  assign cfg_l1d_spm_we = addr_hit[14] & reg_we & !reg_error;
   assign cfg_l1d_spm_wd = reg_wdata[9:0];
 
-  assign cfg_l1d_insn_we = addr_hit[14] & reg_we & !reg_error;
+  assign cfg_l1d_insn_we = addr_hit[15] & reg_we & !reg_error;
   assign cfg_l1d_insn_wd = reg_wdata[1:0];
 
-  assign l1d_spm_commit_we = addr_hit[15] & reg_we & !reg_error;
+  assign l1d_spm_commit_we = addr_hit[16] & reg_we & !reg_error;
   assign l1d_spm_commit_wd = reg_wdata[0];
 
-  assign l1d_insn_commit_we = addr_hit[16] & reg_we & !reg_error;
+  assign l1d_insn_commit_we = addr_hit[17] & reg_we & !reg_error;
   assign l1d_insn_commit_wd = reg_wdata[0];
 
-  assign l1d_flush_status_re = addr_hit[17] & reg_re & !reg_error;
+  assign l1d_flush_status_re = addr_hit[18] & reg_re & !reg_error;
 
   // Read data return
   always_comb begin
@@ -2681,30 +2716,34 @@ module spatz_cluster_peripheral_reg_top #(
       end
 
       addr_hit[11]: begin
-        reg_rdata_next[31:0] = cluster_boot_control_qs;
+        reg_rdata_next[31:0] = spatz_cycle_qs;
       end
 
       addr_hit[12]: begin
-        reg_rdata_next[0] = cluster_eoc_exit_qs;
+        reg_rdata_next[31:0] = cluster_boot_control_qs;
       end
 
       addr_hit[13]: begin
-        reg_rdata_next[9:0] = cfg_l1d_spm_qs;
+        reg_rdata_next[0] = cluster_eoc_exit_qs;
       end
 
       addr_hit[14]: begin
-        reg_rdata_next[1:0] = cfg_l1d_insn_qs;
+        reg_rdata_next[9:0] = cfg_l1d_spm_qs;
       end
 
       addr_hit[15]: begin
-        reg_rdata_next[0] = l1d_spm_commit_qs;
+        reg_rdata_next[1:0] = cfg_l1d_insn_qs;
       end
 
       addr_hit[16]: begin
-        reg_rdata_next[0] = l1d_insn_commit_qs;
+        reg_rdata_next[0] = l1d_spm_commit_qs;
       end
 
       addr_hit[17]: begin
+        reg_rdata_next[0] = l1d_insn_commit_qs;
+      end
+
+      addr_hit[18]: begin
         reg_rdata_next[0] = l1d_flush_status_qs;
       end
 
diff --git a/hw/system/spatz_cluster/test/bootrom.bin b/hw/system/spatz_cluster/test/bootrom.bin
index 25b664e..94fc89b 100755
Binary files a/hw/system/spatz_cluster/test/bootrom.bin and b/hw/system/spatz_cluster/test/bootrom.bin differ
diff --git a/hw/system/spatz_cluster/test/bootrom.dump b/hw/system/spatz_cluster/test/bootrom.dump
index 311a50d..6cda4b2 100644
--- a/hw/system/spatz_cluster/test/bootrom.dump
+++ b/hw/system/spatz_cluster/test/bootrom.dump
@@ -6,64 +6,68 @@ Disassembly of section .text:
 
 00001000 <_start>:
     1000:	00000317          	auipc	t1,0x0
-    1004:	07032303          	lw	t1,112(t1) # 1070 <_GLOBAL_OFFSET_TABLE_+0x4>
+    1004:	07832303          	lw	t1,120(t1) # 1078 <_GLOBAL_OFFSET_TABLE_+0x4>
     1008:	30531073          	csrw	mtvec,t1
     100c:	f1402573          	csrr	a0,mhartid
     1010:	00000597          	auipc	a1,0x0
-    1014:	0645a583          	lw	a1,100(a1) # 1074 <_GLOBAL_OFFSET_TABLE_+0x8>
-    1018:	10500073          	wfi
-    101c:	00c5a383          	lw	t2,12(a1)
-    1020:	0105ae03          	lw	t3,16(a1)
-    1024:	01c383b3          	add	t2,t2,t3
-    1028:	05838393          	addi	t2,t2,88
-    102c:	00038393          	mv	t2,t2
-    1030:	0003a383          	lw	t2,0(t2)
-    1034:	00038067          	jr	t2
-
-00001038 <exception>:
-    1038:	10500073          	wfi
-    103c:	ffdff06f          	j	1038 <exception>
+    1014:	06c5a583          	lw	a1,108(a1) # 107c <_GLOBAL_OFFSET_TABLE_+0x8>
+    1018:	3047d073          	csrwi	mie,15
+    101c:	10500073          	wfi
+    1020:	00c5a383          	lw	t2,12(a1)
+    1024:	0105ae03          	lw	t3,16(a1)
+    1028:	01c383b3          	add	t2,t2,t3
+    102c:	06038393          	addi	t2,t2,96
+    1030:	00038393          	mv	t2,t2
+    1034:	0003a383          	lw	t2,0(t2)
+    1038:	00038067          	jr	t2
+
+0000103c <exception>:
+    103c:	10500073          	wfi
+    1040:	ffdff06f          	j	103c <exception>
 
 Disassembly of section .rodata:
 
-00001040 <BOOTDATA>:
-    1040:	1000                	.2byte	0x1000
-    1042:	0000                	.2byte	0x0
-    1044:	0002                	.2byte	0x2
-    1046:	0000                	.2byte	0x0
-    1048:	0010                	.2byte	0x10
+00001048 <BOOTDATA>:
+    1048:	1000                	.2byte	0x1000
     104a:	0000                	.2byte	0x0
-    104c:	0000                	.2byte	0x0
-    104e:	5100                	.2byte	0x5100
-    1050:	8000                	.2byte	0x8000
+    104c:	0002                	.2byte	0x2
+    104e:	0000                	.2byte	0x0
+    1050:	0010                	.2byte	0x10
+    1052:	0000                	.2byte	0x0
+    1054:	0000                	.2byte	0x0
+    1056:	5100                	.2byte	0x5100
+    1058:	0000                	.2byte	0x0
+    105a:	0002                	.2byte	0x2
+    105c:	0000                	.2byte	0x0
+    105e:	0000                	.2byte	0x0
+    1060:	0000                	.2byte	0x0
+    1062:	8000                	.2byte	0x8000
 	...
-    105a:	8000                	.2byte	0x8000
-	...
-    1064:	0001                	.2byte	0x1
+    106c:	0001                	.2byte	0x1
 	...
 
 Disassembly of section .boot_section:
 
-00001068 <entry_addr>:
-    1068:	1038                	.2byte	0x1038
+00001070 <entry_addr>:
+    1070:	103c                	.2byte	0x103c
 	...
 
 Disassembly of section .got:
 
-0000106c <_GLOBAL_OFFSET_TABLE_>:
-    106c:	0000                	.2byte	0x0
-    106e:	0000                	.2byte	0x0
-    1070:	1038                	.2byte	0x1038
-    1072:	0000                	.2byte	0x0
-    1074:	1040                	.2byte	0x1040
+00001074 <_GLOBAL_OFFSET_TABLE_>:
+    1074:	0000                	.2byte	0x0
+    1076:	0000                	.2byte	0x0
+    1078:	103c                	.2byte	0x103c
+    107a:	0000                	.2byte	0x0
+    107c:	1048                	.2byte	0x1048
 	...
 
 Disassembly of section .got.plt:
 
-00001078 <.got.plt>:
-    1078:	ffff                	.2byte	0xffff
-    107a:	ffff                	.2byte	0xffff
-    107c:	0000                	.2byte	0x0
+00001080 <.got.plt>:
+    1080:	ffff                	.2byte	0xffff
+    1082:	ffff                	.2byte	0xffff
+    1084:	0000                	.2byte	0x0
 	...
 
 Disassembly of section .riscv.attributes:
diff --git a/hw/system/spatz_cluster/test/bootrom.elf b/hw/system/spatz_cluster/test/bootrom.elf
index 51a9616..7075551 100755
Binary files a/hw/system/spatz_cluster/test/bootrom.elf and b/hw/system/spatz_cluster/test/bootrom.elf differ
diff --git a/sw/snRuntime/include/spatz_cluster_peripheral.h b/sw/snRuntime/include/spatz_cluster_peripheral.h
index 508f7ca..8738c32 100644
--- a/sw/snRuntime/include/spatz_cluster_peripheral.h
+++ b/sw/snRuntime/include/spatz_cluster_peripheral.h
@@ -181,8 +181,17 @@ extern "C" {
 #define SPATZ_CLUSTER_PERIPHERAL_SPATZ_STATUS_REG_OFFSET 0x50
 #define SPATZ_CLUSTER_PERIPHERAL_SPATZ_STATUS_SPATZ_CLUSTER_PROBE_BIT 0
 
+// Store cycle counts of kernels
+#define SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_REG_OFFSET 0x58
+#define SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_SPATZ_CYC_MASK 0xffffffff
+#define SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_SPATZ_CYC_OFFSET 0
+#define SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_SPATZ_CYC_FIELD                   \
+  ((bitfield_field32_t){                                                       \
+      .mask = SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_SPATZ_CYC_MASK,             \
+      .index = SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_SPATZ_CYC_OFFSET})
+
 // Controls the cluster boot process.
-#define SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_REG_OFFSET 0x58
+#define SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_REG_OFFSET 0x60
 #define SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_ENTRY_POINT_MASK         \
   0xffffffff
 #define SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_ENTRY_POINT_OFFSET 0
@@ -193,11 +202,11 @@ extern "C" {
           SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_ENTRY_POINT_OFFSET})
 
 // End of computation and exit status register
-#define SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT_REG_OFFSET 0x60
+#define SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT_REG_OFFSET 0x68
 #define SPATZ_CLUSTER_PERIPHERAL_CLUSTER_EOC_EXIT_EOC_EXIT_BIT 0
 
 // Controls the configurations of L1 DCache SPM size.
-#define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_REG_OFFSET 0x68
+#define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_REG_OFFSET 0x70
 #define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_SPM_SIZE_MASK 0x3ff
 #define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_SPM_SIZE_OFFSET 0
 #define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_SPM_SIZE_FIELD                    \
@@ -206,7 +215,7 @@ extern "C" {
       .index = SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_SPM_SPM_SIZE_OFFSET})
 
 // Controls the L1 DCache flushing and invalidation.
-#define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_REG_OFFSET 0x70
+#define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_REG_OFFSET 0x78
 #define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_INSN_MASK 0x3
 #define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_INSN_OFFSET 0
 #define SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_INSN_FIELD                       \
@@ -215,15 +224,15 @@ extern "C" {
       .index = SPATZ_CLUSTER_PERIPHERAL_CFG_L1D_INSN_INSN_OFFSET})
 
 // Controls the L1 DCache flushing and invalidation.
-#define SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT_REG_OFFSET 0x78
+#define SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT_REG_OFFSET 0x80
 #define SPATZ_CLUSTER_PERIPHERAL_L1D_SPM_COMMIT_COMMIT_BIT 0
 
 // Controls the L1 DCache flushing and invalidation.
-#define SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT_REG_OFFSET 0x80
+#define SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT_REG_OFFSET 0x88
 #define SPATZ_CLUSTER_PERIPHERAL_L1D_INSN_COMMIT_COMMIT_BIT 0
 
 // Indicate the status of flushing
-#define SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS_REG_OFFSET 0x88
+#define SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS_REG_OFFSET 0x90
 #define SPATZ_CLUSTER_PERIPHERAL_L1D_FLUSH_STATUS_STATUS_BIT 0
 
 #ifdef __cplusplus
diff --git a/sw/spatzBenchmarks/benchmark/benchmark.c b/sw/spatzBenchmarks/benchmark/benchmark.c
index 5094acd..cbb136e 100644
--- a/sw/spatzBenchmarks/benchmark/benchmark.c
+++ b/sw/spatzBenchmarks/benchmark/benchmark.c
@@ -17,11 +17,11 @@ void start_kernel() {
       (uint32_t *)(_snrt_team_current->root->cluster_mem.end +
                    SPATZ_CLUSTER_PERIPHERAL_SPATZ_STATUS_REG_OFFSET);
   *bench = 1;
-  snrt_start_perf_counter(SNRT_PERF_CNT0, SNRT_PERF_CNT_CYCLES, 0);
+  // snrt_start_perf_counter(SNRT_PERF_CNT0, SNRT_PERF_CNT_CYCLES, 0);
 }
 
 void stop_kernel() {
-  snrt_stop_perf_counter(SNRT_PERF_CNT0);
+  // snrt_stop_perf_counter(SNRT_PERF_CNT0);
   uint32_t *bench =
       (uint32_t *)(_snrt_team_current->root->cluster_mem.end +
                    SPATZ_CLUSTER_PERIPHERAL_SPATZ_STATUS_REG_OFFSET);
@@ -32,8 +32,17 @@ void stop_kernel() {
 size_t get_perf() {
   volatile uint32_t *perf =
       (uint32_t *)(_snrt_team_current->root->cluster_mem.end +
-                   SPATZ_CLUSTER_PERIPHERAL_PERF_COUNTER_0_REG_OFFSET);
-  // There is a constant delay of using performance counter for cycle recording
-  // substract the constant delay
-  return (*perf-60);
+                   SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_REG_OFFSET);
+  // Return the raw value read from the SPATZ_CYCLE register; unlike the
+  // previous perf-counter readout, no constant delay is subtracted here.
+  return (*perf);
+}
+
+void write_cyc(uint32_t cyc) {
+  volatile uint32_t *perf =
+      (uint32_t *)(_snrt_team_current->root->cluster_mem.end +
+                   SPATZ_CLUSTER_PERIPHERAL_SPATZ_CYCLE_REG_OFFSET);
+  // Store the caller-supplied cycle count in the SPATZ_CYCLE register.
+  // The value is written unmodified; no delay correction is applied.
+  *perf = cyc;
 }
diff --git a/sw/spatzBenchmarks/dp-faxpy/main.c b/sw/spatzBenchmarks/dp-faxpy/main.c
index eb80aac..0cfb04c 100644
--- a/sw/spatzBenchmarks/dp-faxpy/main.c
+++ b/sw/spatzBenchmarks/dp-faxpy/main.c
@@ -61,6 +61,7 @@ int main() {
 
   // Reset timer
   unsigned int timer = (unsigned int)-1;
+  uint32_t timer_start, timer_end;
 
   const unsigned int dim = axpy_l.M;
   const unsigned int dim_core = dim / num_cores;
@@ -99,19 +100,24 @@ int main() {
   if (cid == 0)
     start_kernel();
 
+  timer_start = benchmark_get_cycle();
+
   // Call AXPY
   faxpy_v64b(*a, x_int, y_int, dim_core);
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
 
+  timer_end = benchmark_get_cycle();
+
   // End dump
   if (cid == 0)
     stop_kernel();
 
   // Check and display results
   if (cid == 0) {
-    timer = get_perf();
+    timer = timer_end - timer_start;
+    write_cyc(timer);
     long unsigned int performance = 1000 * 2 * dim / timer;
     long unsigned int utilization = performance / (2 * num_cores * 4);
   #ifdef PRINT_CHECK
diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c
index e59e1da..015490a 100644
--- a/sw/spatzBenchmarks/dp-fdotp/main.c
+++ b/sw/spatzBenchmarks/dp-fdotp/main.c
@@ -23,7 +23,8 @@
 #include DATAHEADER
 #include "kernel/fdotp.c"
 
-#define USE_CACHE
+// #define USE_CACHE
+// #define ENABLE_PRINT
 
 double *a;
 double *b;
@@ -136,6 +137,7 @@ int main() {
     if (cid == 0) {
       timer_tmp = benchmark_get_cycle() - timer_tmp;
       timer = (timer < timer_tmp) ? timer : timer_tmp;
+      write_cyc(timer);
     }
 
     snrt_cluster_hw_barrier();
@@ -145,16 +147,19 @@ int main() {
   if (cid == 0) {
     long unsigned int performance = 1000 * 2 * dotp_l.M / timer;
     long unsigned int utilization = performance / (2 * num_cores * 4);
-
+  #ifdef ENABLE_PRINT
     printf("\n----- (%d) dp fdotp -----\n", dotp_l.M);
     printf("The execution took %u cycles.\n", timer);
     printf("The performance is %ld OP/1000cycle (%ld%%o utilization).\n",
            performance, utilization);
+  #endif
   }
 
   if (cid == 0)
     if (fp_check(result[0], dotp_result*measure_iter)) {
+    #ifdef ENABLE_PRINT
       printf("Error: Result = %f, Golden = %f\n", result[0], dotp_result*measure_iter);
+    #endif
       return -1;
     }
 
diff --git a/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c b/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c
index 2666e41..dc193be 100644
--- a/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c
+++ b/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c
@@ -23,7 +23,8 @@
 #include DATAHEADER
 #include "kernel/dp-fmatmul.c"
 
-#define USE_CACHE
+// #define USE_CACHE
+// #define ENABLE_PRINT
 
 #ifndef KERNEL_SIZE
 #define KERNEL_SIZE 4
@@ -162,11 +163,13 @@ int main() {
     long unsigned int performance =
         1000 * 2 * gemm_l.M * gemm_l.N * gemm_l.K / timer;
     long unsigned int utilization = performance / (2 * num_cores * 4);
-
+    write_cyc(timer);
+  #ifdef ENABLE_PRINT
     printf("\n----- (%dx%d) dp fmatmul -----\n", gemm_l.M, gemm_l.N);
     printf("The execution took %u cycles.\n", timer);
     printf("The performance is %ld OP/1000cycle (%ld%%o utilization).\n",
            performance, utilization);
+  #endif
   }
 
   if (cid == 0) {
@@ -174,8 +177,10 @@ int main() {
         verify_matrix(c, (const double *)gemm_checksum, gemm_l.M, gemm_l.N);
 
     if (error != 0) {
+    #ifdef ENABLE_PRINT
       printf("Error core %d: c[%d]=%u\n", cid, error, (int)c[error]);
+    #endif  // only the message is optional; the error code is always returned
       return error;
     }
   }
 
diff --git a/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c b/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c
index 5e6b339..57ba387 100644
--- a/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c
+++ b/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c
@@ -23,7 +23,7 @@
 #include DATAHEADER
 #include "kernel/dp-fmatmul.c"
 
-// #define USE_CACHE
+#define USE_CACHE
 
 #ifndef KERNEL_SIZE
 #define KERNEL_SIZE 8
@@ -149,6 +149,7 @@ int main() {
     if (cid == 0) {
       if (timer_temp < timer) {
         timer = timer_temp;
+        write_cyc(timer);
       }
     }
 
@@ -161,9 +162,9 @@ int main() {
     long unsigned int performance =
         1000 * 2 * gemm_l.M * gemm_l.N * gemm_l.K / timer;
     long unsigned int utilization = performance / (2 * num_cores * 4);
-
+    uint32_t cyc = get_perf();
     printf("\n----- (%dx%d) dp fmatmul -----\n", gemm_l.M, gemm_l.N);
-    printf("The execution took %u cycles.\n", timer);
+    printf("The execution took %u/%u cycles.\n", timer, cyc);
     printf("The performance is %ld OP/1000cycle (%ld%%o utilization).\n",
            performance, utilization);
   }
diff --git a/sw/spatzBenchmarks/include/benchmark.h b/sw/spatzBenchmarks/include/benchmark.h
index 888b71b..cf3af09 100644
--- a/sw/spatzBenchmarks/include/benchmark.h
+++ b/sw/spatzBenchmarks/include/benchmark.h
@@ -12,3 +12,4 @@ size_t benchmark_get_cycle();
 void start_kernel();
 void stop_kernel();
 size_t get_perf();
+void write_cyc(uint32_t cyc);