forked from tecracoin/ccminer
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
import ccminer version with mtp support
- Loading branch information
Showing
522 changed files
with
285,481 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
/** | ||
* bmw-256 MDT | ||
* tpruvot - 2015 | ||
*/ | ||
extern "C" { | ||
#include "sph/sph_bmw.h" | ||
} | ||
|
||
#include <miner.h> | ||
#include <cuda_helper.h> | ||
|
||
static uint32_t *d_hash[MAX_GPUS]; | ||
|
||
extern void bmw256_midstate_init(int thr_id, uint32_t threads); | ||
extern void bmw256_midstate_free(int thr_id); | ||
extern void bmw256_setBlock_80(int thr_id, void *pdata); | ||
extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap); | ||
|
||
extern uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); | ||
|
||
// CPU Hash | ||
extern "C" void bmw_hash(void *state, const void *input) | ||
{ | ||
uint32_t _ALIGN(64) hash[16]; | ||
sph_bmw256_context ctx; | ||
|
||
sph_bmw256_init(&ctx); | ||
sph_bmw256(&ctx, input, 80); | ||
sph_bmw256_close(&ctx, (void*) hash); | ||
|
||
memcpy(state, hash, 32); | ||
} | ||
|
||
static bool init[MAX_GPUS] = { 0 }; | ||
|
||
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) { | ||
return iftrue ? swab32(val) : val; | ||
} | ||
|
||
extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) | ||
{ | ||
uint32_t _ALIGN(64) endiandata[20]; | ||
uint32_t *pdata = work->data; | ||
uint32_t *ptarget = work->target; | ||
const uint32_t first_nonce = pdata[19]; | ||
bool swapnonce = true; | ||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); | ||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); | ||
|
||
if (opt_benchmark) | ||
ptarget[7] = 0x0005; | ||
|
||
if (!init[thr_id]) { | ||
cudaSetDevice(device_map[thr_id]); | ||
if (opt_cudaschedule == -1 && gpu_threads == 1) { | ||
cudaDeviceReset(); | ||
// reduce cpu usage | ||
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); | ||
} | ||
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); | ||
|
||
cuda_check_cpu_init(thr_id, throughput); | ||
bmw256_midstate_init(thr_id, throughput); | ||
|
||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); | ||
|
||
init[thr_id] = true; | ||
} | ||
|
||
for (int k=0; k < 20; k++) { | ||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); | ||
} | ||
|
||
cudaGetLastError(); | ||
bmw256_setBlock_80(thr_id, (void*)endiandata); | ||
|
||
cuda_check_cpu_setTarget(ptarget); | ||
|
||
do { | ||
bmw256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], (int) swapnonce); | ||
uint32_t foundNonce = cuda_check_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); | ||
|
||
*hashes_done = pdata[19] - first_nonce + throughput; | ||
|
||
if (foundNonce != UINT32_MAX) | ||
{ | ||
uint32_t _ALIGN(64) vhash64[8]; | ||
endiandata[19] = swab32_if(foundNonce, swapnonce); | ||
bmw_hash(vhash64, endiandata); | ||
|
||
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { | ||
pdata[19] = swab32_if(foundNonce,!swapnonce); | ||
work_set_target_ratio(work, vhash64); | ||
return 1; | ||
} | ||
else { | ||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); | ||
} | ||
} | ||
|
||
if ((uint64_t) throughput + pdata[19] >= max_nonce) { | ||
pdata[19] = max_nonce; | ||
break; | ||
} | ||
|
||
pdata[19] += throughput; | ||
|
||
} while (!work_restart[thr_id].restart); | ||
|
||
*hashes_done = pdata[19] - first_nonce; | ||
return 0; | ||
} | ||
|
||
// cleanup | ||
extern "C" void free_bmw(int thr_id) | ||
{ | ||
if (!init[thr_id]) | ||
return; | ||
|
||
cudaThreadSynchronize(); | ||
|
||
cudaFree(d_hash[thr_id]); | ||
bmw256_midstate_free(thr_id); | ||
cuda_check_cpu_free(thr_id); | ||
|
||
cudaDeviceSynchronize(); | ||
init[thr_id] = false; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
/** | ||
* Blake-256 Cuda Kernel (Tested on SM 5.0) | ||
* | ||
* Tanguy Pruvot - Nov. 2014 | ||
*/ | ||
extern "C" { | ||
#include "sph/sph_blake.h" | ||
} | ||
|
||
#include "cuda_helper.h" | ||
|
||
#include <memory.h> | ||
|
||
static __device__ uint64_t cuda_swab32ll(uint64_t x) { | ||
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); | ||
} | ||
|
||
__constant__ static uint32_t c_data[3+1]; | ||
|
||
__constant__ static uint32_t sigma[16][16]; | ||
static uint32_t c_sigma[16][16] = { | ||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, | ||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, | ||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, | ||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, | ||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, | ||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, | ||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, | ||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, | ||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, | ||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, | ||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, | ||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, | ||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, | ||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, | ||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, | ||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } | ||
}; | ||
|
||
static const uint32_t c_IV256[8] = { | ||
0x6A09E667, 0xBB67AE85, | ||
0x3C6EF372, 0xA54FF53A, | ||
0x510E527F, 0x9B05688C, | ||
0x1F83D9AB, 0x5BE0CD19 | ||
}; | ||
|
||
__device__ __constant__ static uint32_t cpu_h[8]; | ||
|
||
__device__ __constant__ static uint32_t u256[16]; | ||
static const uint32_t c_u256[16] = { | ||
0x243F6A88, 0x85A308D3, | ||
0x13198A2E, 0x03707344, | ||
0xA4093822, 0x299F31D0, | ||
0x082EFA98, 0xEC4E6C89, | ||
0x452821E6, 0x38D01377, | ||
0xBE5466CF, 0x34E90C6C, | ||
0xC0AC29B7, 0xC97C50DD, | ||
0x3F84D5B5, 0xB5470917 | ||
}; | ||
|
||
|
||
|
||
#define GS2(a,b,c,d,x) { \ | ||
const uint32_t idx1 = sigma[r][x]; \ | ||
const uint32_t idx2 = sigma[r][x + 1]; \ | ||
v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ | ||
v[d] = ROTR32(v[d] ^ v[a], 16); \ | ||
v[c] += v[d]; \ | ||
v[b] = ROTR32(v[b] ^ v[c], 12); \ | ||
\ | ||
v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ | ||
v[d] = ROTR32(v[d] ^ v[a], 8); \ | ||
v[c] += v[d]; \ | ||
v[b] = ROTR32(v[b] ^ v[c], 7); \ | ||
} | ||
|
||
|
||
|
||
//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) | ||
//#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) | ||
#define hostGS(a,b,c,d,x) { \ | ||
const uint8_t idx1 = c_sigma[r][x]; \ | ||
const uint8_t idx2 = c_sigma[r][x + 1]; \ | ||
v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ | ||
v[d] = ROTR32(v[d] ^ v[a], 16); \ | ||
v[c] += v[d]; \ | ||
v[b] = ROTR32(v[b] ^ v[c], 12); \ | ||
\ | ||
v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \ | ||
v[d] = ROTR32(v[d] ^ v[a], 8); \ | ||
v[c] += v[d]; \ | ||
v[b] = ROTR32(v[b] ^ v[c], 7); \ | ||
} | ||
|
||
|
||
/* Second part (64-80) msg never change, store it */ | ||
__device__ __constant__ static const uint32_t c_Padding[16] = { | ||
0, 0, 0, 0, | ||
0x80000000, 0, 0, 0, | ||
0, 0, 0, 0, | ||
0, 1, 0, 640, | ||
}; | ||
|
||
__host__ __forceinline__ | ||
static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0) | ||
{ | ||
uint32_t m[16]; | ||
uint32_t v[16]; | ||
|
||
for (int i = 0; i < 16; i++) { | ||
m[i] = block[i]; | ||
} | ||
|
||
for (int i = 0; i < 8; i++) | ||
v[i] = h[i]; | ||
|
||
v[8] = c_u256[0]; | ||
v[9] = c_u256[1]; | ||
v[10] = c_u256[2]; | ||
v[11] = c_u256[3]; | ||
|
||
v[12] = c_u256[4] ^ T0; | ||
v[13] = c_u256[5] ^ T0; | ||
v[14] = c_u256[6]; | ||
v[15] = c_u256[7]; | ||
|
||
for (int r = 0; r < 14; r++) { | ||
/* column step */ | ||
hostGS(0, 4, 0x8, 0xC, 0x0); | ||
hostGS(1, 5, 0x9, 0xD, 0x2); | ||
hostGS(2, 6, 0xA, 0xE, 0x4); | ||
hostGS(3, 7, 0xB, 0xF, 0x6); | ||
/* diagonal step */ | ||
hostGS(0, 5, 0xA, 0xF, 0x8); | ||
hostGS(1, 6, 0xB, 0xC, 0xA); | ||
hostGS(2, 7, 0x8, 0xD, 0xC); | ||
hostGS(3, 4, 0x9, 0xE, 0xE); | ||
} | ||
|
||
for (int i = 0; i < 16; i++) { | ||
int j = i & 7; | ||
h[j] ^= v[i]; | ||
} | ||
} | ||
|
||
__device__ __forceinline__ | ||
static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0) | ||
{ | ||
uint32_t m[16]; | ||
uint32_t v[16]; | ||
|
||
m[0] = block[0]; | ||
m[1] = block[1]; | ||
m[2] = block[2]; | ||
m[3] = block[3]; | ||
|
||
#pragma unroll | ||
for (int i = 4; i < 16; i++) { | ||
m[i] = c_Padding[i]; | ||
} | ||
|
||
#pragma unroll 8 | ||
for (int i = 0; i < 8; i++) | ||
v[i] = h[i]; | ||
|
||
v[8] = u256[0]; | ||
v[9] = u256[1]; | ||
v[10] = u256[2]; | ||
v[11] = u256[3]; | ||
|
||
v[12] = u256[4] ^ T0; | ||
v[13] = u256[5] ^ T0; | ||
v[14] = u256[6]; | ||
v[15] = u256[7]; | ||
|
||
#pragma unroll 14 | ||
for (int r = 0; r < 14; r++) { | ||
/* column step */ | ||
GS2(0, 4, 0x8, 0xC, 0x0); | ||
GS2(1, 5, 0x9, 0xD, 0x2); | ||
GS2(2, 6, 0xA, 0xE, 0x4); | ||
GS2(3, 7, 0xB, 0xF, 0x6); | ||
/* diagonal step */ | ||
GS2(0, 5, 0xA, 0xF, 0x8); | ||
GS2(1, 6, 0xB, 0xC, 0xA); | ||
GS2(2, 7, 0x8, 0xD, 0xC); | ||
GS2(3, 4, 0x9, 0xE, 0xE); | ||
} | ||
|
||
#pragma unroll 16 | ||
for (int i = 0; i < 16; i++) { | ||
int j = i & 7; | ||
h[j] ^= v[i]; | ||
} | ||
} | ||
|
||
__global__ __launch_bounds__(256,3) | ||
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash) | ||
{ | ||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); | ||
if (thread < threads) | ||
{ | ||
uint32_t h[8]; | ||
uint32_t input[4]; | ||
|
||
#pragma unroll | ||
for (int i = 0; i < 8; i++) h[i] = cpu_h[i]; | ||
|
||
#pragma unroll | ||
for (int i = 0; i < 3; ++i) input[i] = c_data[i]; | ||
|
||
input[3] = startNonce + thread; | ||
blake256_compress2nd(h, input, 640); | ||
|
||
#pragma unroll | ||
for (int i = 0; i<4; i++) { | ||
Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1])); | ||
} | ||
} | ||
} | ||
|
||
__host__ | ||
void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) | ||
{ | ||
const uint32_t threadsperblock = 256; | ||
|
||
dim3 grid((threads + threadsperblock - 1) / threadsperblock); | ||
dim3 block(threadsperblock); | ||
|
||
blake256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, Hash); | ||
MyStreamSynchronize(NULL, order, thr_id); | ||
} | ||
|
||
__host__ | ||
void blake256_cpu_setBlock_80(uint32_t *pdata) | ||
{ | ||
uint32_t h[8], data[20]; | ||
|
||
memcpy(data, pdata, 80); | ||
memcpy(h, c_IV256, sizeof(c_IV256)); | ||
blake256_compress1st(h, pdata, 512); | ||
|
||
cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice); | ||
cudaMemcpyToSymbol(c_data, &data[16], sizeof(c_data), 0, cudaMemcpyHostToDevice); | ||
} | ||
|
||
__host__ | ||
void blake256_cpu_init(int thr_id, uint32_t threads) | ||
{ | ||
cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice); | ||
cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); | ||
} |
Oops, something went wrong.