This repository has been archived by the owner on Jan 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 165
/
Copy pathcublas_sgemm.ptx
65 lines (52 loc) · 1.77 KB
/
cublas_sgemm.ptx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// Module-level directives: PTX ISA version, target architecture, and address width.
// NOTE(review): the ptxas command line in the comments of this file passes `-m 32`,
// while the module declares 64-bit addresses here -- confirm the two are meant to agree.
.version 4.1
.target sm_50
.address_size 64
// ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx
// The kernels in this file are empty stubs (their bodies only return). You can use maxas to insert the cublas_device.lib code into a cubin built from this PTX:
// From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib
// cuobjdump -lelf cublas_device.lib | find "sm_50"
// cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib
// maxas -l maxwell_sgemm.asm.sm_50.cubin
// maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass
// maxas -e -k maxwell_sgemm_128x64_nt maxwell_sgemm_128x64_nt.sass
// maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin
// maxas -i maxwell_sgemm_128x64_nt.sass cublas_sgemm.cubin
// The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas.
// Stub entry point: maxwell_sgemm_128x128_nt
//
// The body contains nothing but `ret`. It exists so that ptxas emits a cubin
// carrying this kernel's name, parameter list, required CTA size and shared
// memory declaration; the notes in this file describe replacing the code
// afterwards with `maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin`.
//
// Parameter order and types are the kernel's launch interface -- do not
// reorder them. (Parameter meanings are not visible from this file; the
// names suggest matrix pointers, leading dimensions and scaling factors --
// TODO confirm against sgemm.cpp.)
.visible .entry maxwell_sgemm_128x128_nt(
    // 64-bit pointers to global memory, 8-byte aligned
    .param .u64 .ptr.global.align 8     param_A,
    .param .u64 .ptr.global.align 8     param_B,
    .param .u64 .ptr.global.align 8     param_C,

    // signed 32-bit scalars
    .param .s32                         param_lda,
    .param .s32                         param_ldb,
    .param .s32                         param_ldc,
    .param .s32                         param_k,

    // 64-bit pointers to global memory, 8-byte aligned
    .param .u64 .ptr.global.align 8     param_Alpha,
    .param .u64 .ptr.global.align 8     param_Beta,

    // signed 32-bit scalars
    .param .s32                         param_alpha,
    .param .s32                         param_beta,
    .param .s32                         param_flag
)
.reqntid 256                            // launch must use exactly 256 threads per CTA
{
    // 16384 bytes of shared memory, 16-byte aligned; declared but not
    // referenced by this stub body.
    .shared .align 16 .b8 share[16384];

    ret;
}
// Stub entry point: maxwell_sgemm_128x64_nt
//
// The body contains nothing but `ret`. It exists so that ptxas emits a cubin
// carrying this kernel's name, parameter list, required CTA size and shared
// memory declaration; the notes in this file describe replacing the code
// afterwards with `maxas -i maxwell_sgemm_128x64_nt.sass cublas_sgemm.cubin`.
//
// Parameter order and types are the kernel's launch interface -- do not
// reorder them. (Parameter meanings are not visible from this file; the
// names suggest matrix pointers, leading dimensions and scaling factors --
// TODO confirm against sgemm.cpp.)
.visible .entry maxwell_sgemm_128x64_nt(
    // 64-bit pointers to global memory, 8-byte aligned
    .param .u64 .ptr.global.align 8     param_A,
    .param .u64 .ptr.global.align 8     param_B,
    .param .u64 .ptr.global.align 8     param_C,

    // signed 32-bit scalars
    .param .s32                         param_lda,
    .param .s32                         param_ldb,
    .param .s32                         param_ldc,
    .param .s32                         param_k,

    // 64-bit pointers to global memory, 8-byte aligned
    .param .u64 .ptr.global.align 8     param_Alpha,
    .param .u64 .ptr.global.align 8     param_Beta,

    // signed 32-bit scalars
    .param .s32                         param_alpha,
    .param .s32                         param_beta,
    .param .s32                         param_flag
)
.reqntid 128                            // launch must use exactly 128 threads per CTA
{
    // 12288 bytes of shared memory, 16-byte aligned; declared but not
    // referenced by this stub body.
    .shared .align 16 .b8 share[12288];

    ret;
}