From 7059213843cb0f4e114d04c529b760cf005c08f2 Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Mon, 6 Jan 2025 18:14:41 +0100
Subject: [PATCH] sw/fft: Add more details on perf calculation

---
 sw/spatzBenchmarks/dp-fft/main.c | 21 +++++++++++++++++++++
 sw/spatzBenchmarks/sp-fft/main.c |  1 +
 2 files changed, 22 insertions(+)

diff --git a/sw/spatzBenchmarks/dp-fft/main.c b/sw/spatzBenchmarks/dp-fft/main.c
index 9ff1ebf..8a2acc5 100644
--- a/sw/spatzBenchmarks/dp-fft/main.c
+++ b/sw/spatzBenchmarks/dp-fft/main.c
@@ -114,6 +114,7 @@ int main() {
 
   // Display runtime
   if (cid == 0) {
+    // See the bottom of the file for additional info on the performance calculation
     long unsigned int performance =
         1000 * 5 * NFFT * log2_nfft / timer;
     long unsigned int utilization =
@@ -146,3 +147,23 @@ int main() {
 
   return 0;
 }
+
+// Comments on performance calculation:
+// Number of mathematical operations: 5 * NFFT * log2(NFFT)
+// Each fp-add, fp-mul, fp-sub, fp-div counts as one floating-point operation
+// and can be executed in one cycle by each FPU.
+// In contrast, macc-like instructions, i.e., multiply-and-accumulate, count as two operations each,
+// and an FPU can compute one macc instruction per cycle, i.e., two fp-operations per cycle in this case.
+
+// Max perf: 5/4 * #FPUs [DP-FLOP/cycle] -> we replace the 5/4 with the equivalent 1250/1000
+// Why the 5/4 factor?
+// If we had only macc-like instructions in the kernel, the max throughput would be 2 DP-FLOP/(cycle)
+// for each FPU.
+// Instead, if we only had non-macc instructions in the kernel, the max throughput would be 1 DP-FLOP/(cycle)
+// for each FPU.
+// In an intermediate case, the value is between 2 and 1 DP-FLOP/(cycle) for each FPU.
+// The kernel loop is composed of 8 vector FP instructions. Two of them are macc-like and six are not.
+// Thus, in each loop iteration and for a vector length of one element (the vector length cancels out anyway),
+// we have 2 * 2 + 6 * 1 = 10 operations executed in 8 cycles in the best possible case with 100% utilization.
+// Therefore, the maximum performance at 100% utilization is 10/8 DP-FLOP/(cycle) per FPU, i.e.,
+// 5/4 DP-FLOP/(cycle) per FPU.
diff --git a/sw/spatzBenchmarks/sp-fft/main.c b/sw/spatzBenchmarks/sp-fft/main.c
index 3963085..df3f392 100644
--- a/sw/spatzBenchmarks/sp-fft/main.c
+++ b/sw/spatzBenchmarks/sp-fft/main.c
@@ -113,6 +113,7 @@ int main() {
 
   // Display runtime
   if (cid == 0) {
+    // See the bottom of the file dp-fft/main.c for further info on the performance calculation
     long unsigned int performance =
         1000 * 5 * NFFT * log2_nfft / timer;
     long unsigned int utilization =