From a2b2693c544614bff31973af7d16d328037722cd Mon Sep 17 00:00:00 2001
From: fancyIX <fancyIX@yahoo.com>
Date: Fri, 18 Feb 2022 23:50:06 -0800
Subject: [PATCH] Issue #11

Seems working
---
 Makefile.am         |   4 +-
 allium.cu           |  15 +-
 ccminer.vcxproj     |   2 +-
 lyra2/cuda_lyra2.cu | 362 +++++++++++++++++++++++++++++++++++++++++++-
 lyra2/lyra2RE.cu    |   4 +-
 5 files changed, 372 insertions(+), 15 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index 6e041fc4e9..7ed433f99f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -116,10 +116,10 @@ endif
 ccminer_LDADD += -lcuda
 
 nvcc_ARCH :=
-#nvcc_ARCH += -gencode=arch=compute_86,code=\"sm_86,compute_86\"
+nvcc_ARCH += -gencode=arch=compute_86,code=\"sm_86,compute_86\"
 nvcc_ARCH += -gencode=arch=compute_75,code=\"sm_75,compute_75\"
 nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
-nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
+#nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
 #nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
 #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
 #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
diff --git a/allium.cu b/allium.cu
index b881a99beb..3301433485 100644
--- a/allium.cu
+++ b/allium.cu
@@ -29,8 +29,8 @@ extern void skein256_cpu_init(int thr_id, uint32_t threads);
 
 extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
 extern void lyra2_cpu_init_high_end(int thr_id, uint32_t threads, uint64_t *g_pad);
-extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti, bool high_end);
-extern void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, uint64_t *g_pad, bool gtx750ti, bool high_end);
+extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti, uint32_t high_end);
+extern void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, uint64_t *g_pad, bool gtx750ti, uint32_t high_end);
 
 extern void groestl256_cpu_init(int thr_id, uint32_t threads);
 extern void groestl256_cpu_free(int thr_id);
@@ -92,7 +92,7 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce
 		ptarget[7] = 0x0400;
 
 	static __thread bool gtx750ti;
-	static __thread bool high_end;
+	static __thread uint32_t high_end;
 	if (!init[thr_id])
 	{
 		int dev_id = device_map[thr_id];
@@ -111,11 +111,12 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce
 		else gtx750ti = false;
 
 		if (strstr(props.name, "1080") ||
-		    strstr(props.name, "1070") ||
+		    strstr(props.name, "1070")) high_end = 1;
+		if (strstr(props.name, "3090") ||
 		    strstr(props.name, "3080") ||
             strstr(props.name, "3070") ||
-		    strstr(props.name, "3060")) high_end = true;
-		else high_end = false;
+		    strstr(props.name, "3060")) high_end = 2;
+		else high_end = 0;
 
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
@@ -130,7 +131,7 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce
 			size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
 			CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
 			lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
-			if (high_end) {
+			if (high_end == 1) {
 				size_t pad_sz = sizeof(uint64_t) * 8 * 8 * 3 * 4;
 				CUDA_SAFE_CALL(cudaMalloc(&g_pad[thr_id], pad_sz * throughput));
 				lyra2_cpu_init_high_end(thr_id, throughput, g_pad[thr_id]);
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index f8d072a60f..fc3fa75a6d 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -198,7 +198,7 @@
       <MaxRegCount>128</MaxRegCount>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>true</Keep>
-      <CodeGeneration>compute_75,sm_75;compute_61,sm_61;compute_52,sm_52</CodeGeneration>
+      <CodeGeneration>compute_86,sm_86;compute_75,sm_75;compute_61,sm_61</CodeGeneration>
       <Include>$(NVTOOLSEXT_PATH)\include</Include>
       <Optimization>O3</Optimization>
       <TargetMachinePlatform>64</TargetMachinePlatform>
diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu
index ceb4d1c3e0..9cb8e56ad2 100644
--- a/lyra2/cuda_lyra2.cu
+++ b/lyra2/cuda_lyra2.cu
@@ -34,6 +34,25 @@
  
  __device__ uint2 *DMatrix;
  
+ __device__ __forceinline__ void LD4SSB(uint2 res[3], const int row, const int col, const int thread, const int threads)
+ {
+     extern __shared__ uint2 shared_mem[];
+     const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
+
+	res[0] = shared_mem[((s0 + 0) * 8 + threadIdx.y) * 4 + threadIdx.x];
+	res[1] = shared_mem[((s0 + 1) * 8 + threadIdx.y) * 4 + threadIdx.x];
+	res[2] = shared_mem[((s0 + 2) * 8 + threadIdx.y) * 4 + threadIdx.x];
+ }
+ 
+ __device__ __forceinline__ void ST4SSB(const int row, const int col, const uint2 data[3], const int thread, const int threads)
+ {
+     extern __shared__ uint2 shared_mem[];
+     const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
+ 
+     shared_mem[((s0 + 0) * 8 + threadIdx.y) * 4 + threadIdx.x] = data[0];
+	 shared_mem[((s0 + 1) * 8 + threadIdx.y) * 4 + threadIdx.x] = data[1];
+	 shared_mem[((s0 + 2) * 8 + threadIdx.y) * 4 + threadIdx.x] = data[2];
+ }
 
  __device__ __forceinline__ void LD4SS(uint2 res[3], const int row, const int col, const int thread, const int threads)
  {
@@ -506,6 +525,297 @@
          state[j] ^= last[j];
  }
 
+ // ================================= big local mem ==========================
+ static __device__ __forceinline__
+ void reduceDuplex_biglocal(uint2 state[4], uint32_t thread, const uint32_t threads)
+ {
+     uint2 state1[3];
+	 uint2 state2[3];
+ 
+
+     for (int i = 0; i < Nrow; i++)
+     {
+         ST4SSB(0, Ncol - i - 1, state, thread, threads);
+ 
+         round_lyra(state);
+     }
+ 
+     for (int i = 0; i < Nrow; i+=2)
+     {
+         LD4SSB(state1, 0, i, thread, threads);
+		 LD4SSB(state2, 0, i + 1, thread, threads);
+		 #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state[j] ^= state1[j];
+ 
+         round_lyra(state);
+ 
+		 #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state1[j] ^= state[j];
+		
+			 #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state[j] ^= state2[j];
+ 
+         round_lyra(state);
+ 
+		 #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state2[j] ^= state[j];
+         ST4SSB(1, Ncol - i - 1, state1, thread, threads);
+		 ST4SSB(1, Ncol - (i + 1) - 1, state2, thread, threads);
+     }
+ }
+ 
+ static __device__ __forceinline__
+ void reduceDuplexRowSetup_biglocal(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads)
+ {
+     uint2 state1[3], state2[3], state3[3], state4[3];
+
+     for (int i = 0; i < Nrow; i+=2)
+     {
+         LD4SSB(state1, rowIn, i, thread, threads);
+		 LD4SSB(state2, rowInOut, i, thread, threads);
+		 LD4SSB(state3, rowIn, i + 1, thread, threads);
+		 LD4SSB(state4, rowInOut, i + 1, thread, threads);
+		 #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state[j] ^= state1[j] + state2[j];
+ 
+         round_lyra(state);
+ 
+         #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state1[j] ^= state[j];
+ 
+         ST4SSB(rowOut, Ncol - i - 1, state1, thread, threads);
+ 
+         // simultaneously receive data from preceding thread and send data to following thread
+         uint2 Data0 = state[0];
+         uint2 Data1 = state[1];
+         uint2 Data2 = state[2];
+         WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+ 
+         if (threadIdx.x == 0)
+         {
+             state2[0] ^= Data2;
+             state2[1] ^= Data0;
+             state2[2] ^= Data1;
+         } else {
+             state2[0] ^= Data0;
+             state2[1] ^= Data1;
+             state2[2] ^= Data2;
+         }
+ 
+         ST4SSB(rowInOut, i, state2, thread, threads);
+
+		//=====================================
+		 #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state[j] ^= state3[j] + state4[j];
+ 
+         round_lyra(state);
+ 
+         #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state3[j] ^= state[j];
+ 
+         ST4SSB(rowOut, Ncol - (i + 1) - 1, state3, thread, threads);
+ 
+         // simultaneously receive data from preceding thread and send data to following thread
+         uint2 Data01 = state[0];
+         uint2 Data11 = state[1];
+         uint2 Data21 = state[2];
+         WarpShuffle3(Data01, Data11, Data21, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+ 
+         if (threadIdx.x == 0)
+         {
+             state4[0] ^= Data21;
+             state4[1] ^= Data01;
+             state4[2] ^= Data11;
+         } else {
+             state4[0] ^= Data01;
+             state4[1] ^= Data11;
+             state4[2] ^= Data21;
+         }
+ 
+         ST4SSB(rowInOut, (i + 1), state4, thread, threads);
+     }
+ }
+ 
+ static __device__ __forceinline__
+ void reduceDuplexRowt_biglocal(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
+ {
+     for (int i = 0; i < Nrow; i+=2)
+     {
+         uint2 state1[3], state2[3], state3[3], state4[3];
+ 
+         LD4SSB(state1, rowIn, i, thread, threads);
+         LD4SSB(state2, rowInOut, i, thread, threads);
+		 LD4SSB(state3, rowIn, i + 1, thread, threads);
+         LD4SSB(state4, rowInOut, i + 1, thread, threads);
+ 
+ #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state[j] ^= state1[j] + state2[j];
+ 
+         LD4SSB(state1, rowOut, i, thread, threads);
+
+         round_lyra(state);
+ 
+         // simultaneously receive data from preceding thread and send data to following thread
+         uint2 Data0 = state[0];
+         uint2 Data1 = state[1];
+         uint2 Data2 = state[2];
+         WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+ 
+         if (threadIdx.x == 0)
+         {
+             state2[0] ^= Data2;
+             state2[1] ^= Data0;
+             state2[2] ^= Data1;
+         }
+         else
+         {
+             state2[0] ^= Data0;
+             state2[1] ^= Data1;
+             state2[2] ^= Data2;
+         }
+
+        if (rowInOut != rowOut) {
+             ST4SSB(rowInOut, i, state2, thread, threads);
+                 #pragma unroll
+            for (int j = 0; j < 3; j++)
+                state2[j] = state1[j];
+         }
+
+#pragma unroll
+        for (int j = 0; j < 3; j++)
+            state2[j] ^= state[j];
+
+        ST4SSB(rowOut, i, state2, thread, threads);
+
+		 //======================================
+ 
+ 
+ #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state[j] ^= state3[j] + state4[j];
+ 
+        LD4SSB(state3, rowOut, i + 1, thread, threads);
+
+         round_lyra(state);
+ 
+         // simultaneously receive data from preceding thread and send data to following thread
+         uint2 Data01 = state[0];
+         uint2 Data11 = state[1];
+         uint2 Data21 = state[2];
+         WarpShuffle3(Data01, Data11, Data21, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+ 
+         if (threadIdx.x == 0)
+         {
+             state4[0] ^= Data21;
+             state4[1] ^= Data01;
+             state4[2] ^= Data11;
+         }
+         else
+         {
+             state4[0] ^= Data01;
+             state4[1] ^= Data11;
+             state4[2] ^= Data21;
+         }
+
+         if (rowInOut != rowOut) {
+             ST4SSB(rowInOut, i + 1, state4, thread, threads);
+                 #pragma unroll
+            for (int j = 0; j < 3; j++)
+                state4[j] = state3[j];
+         }
+
+#pragma unroll
+        for (int j = 0; j < 3; j++)
+            state4[j] ^= state[j];
+
+        ST4SSB(rowOut, i + 1, state4, thread, threads);
+     }
+ }
+ 
+ static __device__ __forceinline__
+ void reduceDuplexRowt_8_biglocal(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
+ {
+     uint2 state1[3], state2[3], state3[3], state4[3], last[3];
+ 
+     LD4SSB(state1, 2, 0, thread, threads);
+     LD4SSB(last, rowInOut, 0, thread, threads);
+ 
+     #pragma unroll
+     for (int j = 0; j < 3; j++)
+         state[j] ^= state1[j] + last[j];
+ 
+     round_lyra(state);
+ 
+     // simultaneously receive data from preceding thread and send data to following thread
+     uint2 Data0 = state[0];
+     uint2 Data1 = state[1];
+     uint2 Data2 = state[2];
+     WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+ 
+     if (threadIdx.x == 0)
+     {
+         last[0] ^= Data2;
+         last[1] ^= Data0;
+         last[2] ^= Data1;
+     } else {
+         last[0] ^= Data0;
+         last[1] ^= Data1;
+         last[2] ^= Data2;
+     }
+ 
+     if (rowInOut == 5)
+     {
+         #pragma unroll
+         for (int j = 0; j < 3; j++)
+             last[j] ^= state[j];
+     }
+ 
+	 LD4SSB(state1, 2, 1, thread, threads);
+	 LD4SSB(state2, rowInOut, 1, thread, threads);
+
+	 #pragma unroll
+	 for (int j = 0; j < 3; j++)
+		 state[j] ^= state1[j] + state2[j];
+
+	 round_lyra(state);
+
+     for (int i = 2; i < Nrow; i+=2)
+     {
+         LD4SSB(state1, 2, i, thread, threads);
+         LD4SSB(state2, rowInOut, i, thread, threads);
+		 LD4SSB(state3, 2, i + 1, thread, threads);
+         LD4SSB(state4, rowInOut, i + 1, thread, threads);
+ 
+         #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state[j] ^= state1[j] + state2[j];
+ 
+         round_lyra(state);
+
+		 //============================
+ 
+         #pragma unroll
+         for (int j = 0; j < 3; j++)
+             state[j] ^= state3[j] + state4[j];
+ 
+         round_lyra(state);
+     }
+ 
+     #pragma unroll
+     for (int j = 0; j < 3; j++)
+         state[j] ^= last[j];
+ }
+ // ================================= local mem big ==========================
+
  // ================================= high end ==========================
  static __device__ __forceinline__
  void reduceDuplex_high_end(uint2 state[4], uint32_t thread, const uint32_t threads, uint64_t *g_pad)
@@ -908,6 +1218,47 @@
      }
  }
  
+ __global__
+ __launch_bounds__(64, 1)
+ void lyra2_gpu_hash_32_2_biglocal(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
+ {
+     const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
+ 
+     if (thread < threads)
+     {
+         uint2 state[4];
+         state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]);
+         state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]);
+         state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]);
+         state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]);
+ 
+         reduceDuplex_biglocal(state, thread, threads);
+         reduceDuplexRowSetup_biglocal(1, 0, 2, state, thread, threads);
+         reduceDuplexRowSetup_biglocal(2, 1, 3, state, thread, threads);
+         reduceDuplexRowSetup_biglocal(3, 0, 4, state, thread, threads);
+         reduceDuplexRowSetup_biglocal(4, 3, 5, state, thread, threads);
+         reduceDuplexRowSetup_biglocal(5, 2, 6, state, thread, threads);
+         reduceDuplexRowSetup_biglocal(6, 1, 7, state, thread, threads);
+
+         uint32_t rowa;
+         uint32_t row = 0;
+         uint32_t pre = 7;
+         for (int i = 0; i < 7; i++) {
+            rowa = WarpShuffle(state[0].x, 0, 4) & 7;
+            reduceDuplexRowt_biglocal(pre, rowa, row, state, thread, threads);
+            pre = row;
+            row = (row + 3) % 8;
+         }
+         rowa = WarpShuffle(state[0].x, 0, 4) & 7;
+         reduceDuplexRowt_8_biglocal(rowa, state, thread, threads);
+ 
+         DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0];
+         DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1];
+         DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2];
+         DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3];
+     }
+ }
+
 __global__
  __launch_bounds__(64, 1)
  void lyra2_gpu_hash_32_2_high_end(uint32_t threads, uint32_t startNounce, uint64_t *g_pad)
@@ -995,12 +1346,12 @@ void lyra2_cpu_init_high_end(int thr_id, uint32_t threads, uint64_t *g_pad)
 }
 
 __host__
-void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti, bool high_end)
+void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti, uint32_t high_end)
 {
 }
 
 __host__
-void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, uint64_t *g_pad, bool gtx750ti, bool high_end)
+void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, uint64_t *g_pad, bool gtx750ti, uint32_t high_end)
 {
 	int dev_id = device_map[thr_id % MAX_GPUS];
 
@@ -1010,6 +1361,9 @@ void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounc
 	else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
 	else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
 
+    dim3 grid0((threads * 4 + 32 - 1) / 32);
+	dim3 block0(4, 32 >> 2);
+
 	dim3 grid1((threads * 4 + 64 - 1) / 64);
 	dim3 block1(4, 64 >> 2);
 
@@ -1023,8 +1377,10 @@ void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounc
 	{
 		lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
 
-        if (high_end)
+        if (high_end == 1)
             lyra2_gpu_hash_32_2_high_end <<< grid1, block1, 12 * (8 - 0) * sizeof(uint2) * 64 >>> (threads, startNounce, g_pad);
+        else if (high_end == 2)
+            lyra2_gpu_hash_32_2_biglocal <<< grid0, block0, 24 * (8 - 0) * sizeof(uint2) * 32 >>> (threads, startNounce, g_pad);
         else
 		    lyra2_gpu_hash_32_2 <<< grid1, block1, 12 * (8 - 0) * sizeof(uint2) * 64 >>> (threads, startNounce, d_hash);
 
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index 406bc979c1..88a2133e95 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -26,7 +26,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
 
 extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
-extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti, bool high_end);
+extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti, uint32_t high_end);
 
 extern void groestl256_cpu_init(int thr_id, uint32_t threads);
 extern void groestl256_cpu_free(int thr_id);
@@ -80,7 +80,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		ptarget[7] = 0x00ff;
 
 	static __thread bool gtx750ti;
-	static __thread bool high_end;
+	static __thread uint32_t high_end;
 	if (!init[thr_id])
 	{
 		int dev_id = device_map[thr_id];