-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy patht266.ptx
87 lines (80 loc) · 2.03 KB
/
t266.ptx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21124049
// Cuda compilation tools, release 8.0, V8.0.44
// Based on LLVM 3.4svn
//
// Module-level directives: PTX ISA version, target architecture, and the
// address width used throughout this module.
.version 5.0
.target sm_20
.address_size 64

// .globl _Z3dotPfS_S_

// Prototype of the external vprintf routine that the kernel below calls:
// two 64-bit parameters in, one 32-bit value returned.
.extern .func (.param .b32 func_retval0) vprintf (
    .param .b64 vprintf_param_0,
    .param .b64 vprintf_param_1
);

// Bytes 37, 100, 0 are ASCII '%', 'd', NUL, i.e. the string "%d".
// The kernel passes the address of this array as vprintf's first argument.
.global .align 1 .b8 $str[3] = {37, 100, 0};
// -----------------------------------------------------------------------
// _Z3dotPfS_S_ : entry kernel taking three 64-bit parameters that are used
// as addresses of f32 data in global memory.
//
// Per thread, with i = %ntid.x * %ctaid.x + %tid.x (low 32 bits):
//   1. loads the f32 at param_0 + 4*i and at param_1 + 4*i and converts
//      each to s32 with round-toward-zero (both results are overwritten
//      in step 2 before anything reads them);
//   2. zeroes the 32-bit scratch registers, sets r1 = 1048576 and forms
//      %rd0 = r1 * %r2 with mul.wide.s32 (%r2 is 0 at that point);
//   3. stores %rd0 through a .u32 store into the 8-byte local buffer and
//      calls vprintf($str, buffer);
//   4. converts %r7 (0 at that point) to f32 and stores it at
//      param_2 + 4*i.
//
// NOTE(review): steps 1-2 look like a machine-injected test sequence
// (dead loads, a repeated zeroing of %r2, un-prefixed r1/r2 next to
// %r1/%r2). Every instruction is kept as found - confirm intent before
// simplifying.
// -----------------------------------------------------------------------
.visible .entry _Z3dotPfS_S_(
    .param .u64 _Z3dotPfS_S__param_0,
    .param .u64 _Z3dotPfS_S__param_1,
    .param .u64 _Z3dotPfS_S__param_2
)
{
    // ---- Storage declarations ------------------------------------------
    // 8-byte local buffer; its address is handed to vprintf as the
    // argument block.
    .local .align 8 .b8 __local_depot0[8];
    .reg .b64 %SP;
    .reg .b64 %SPL;                 // declared, never referenced below
    .reg .f32 %f<4>;
    .reg .b32 %r<10>;
    .reg .b32 r1, r2;               // distinct registers from %r1 / %r2
    .reg .b64 %rd<16>;
    .reg .b64 %r_sim_ld<3>;         // declared, never referenced below
    .reg .b32 %r_ssim_ld;           // declared, never referenced below
    .reg .b16 %t<10>;               // declared, never referenced below

    // ---- Local buffer addresses ----------------------------------------
    mov.u64             %rd15, __local_depot0;
    cvta.local.u64      %SP, %rd15;         // generic address of the buffer
    add.u64             %rd7, %SP, 0;       // %rd7: generic address, passed to vprintf
    cvta.to.local.u64   %rd8, %rd7;         // %rd8: local-space address, used by st.local

    // ---- Kernel parameters, converted to global-space addresses --------
    ld.param.u64        %rd1, [_Z3dotPfS_S__param_0];
    ld.param.u64        %rd2, [_Z3dotPfS_S__param_1];
    ld.param.u64        %rd3, [_Z3dotPfS_S__param_2];
    cvta.to.global.u64  %rd4, %rd3;         // %rd4 <- param_2
    cvta.to.global.u64  %rd5, %rd2;         // %rd5 <- param_1
    cvta.to.global.u64  %rd6, %rd1;         // %rd6 <- param_0

    // ---- Thread index and byte offset ----------------------------------
    mov.u32             %r1, %tid.x;
    mov.u32             %r2, %ctaid.x;
    mov.u32             %r3, %ntid.x;
    mad.lo.s32          %r4, %r3, %r2, %r1; // i = ntid.x * ctaid.x + tid.x
    mul.wide.s32        %rd9, %r4, 4;       // byte offset of 4-byte element i

    // ---- Load both inputs and truncate to integers ---------------------
    add.s64             %rd10, %rd6, %rd9;
    ld.global.f32       %f1, [%rd10];
    cvt.rzi.s32.f32     %r5, %f1;
    add.s64             %rd11, %rd5, %rd9;
    ld.global.f32       %f2, [%rd11];
    cvt.rzi.s32.f32     %r6, %f2;

    // ---- Scratch reset and wide multiply -------------------------------
    // %rd9 was computed above, so clearing %r4 here does not affect the
    // element offset used for the final store.
    mov.s32             %r4, 0;
    mov.s32             %r5, 0;             // discards the converted param_0 value
    mov.s32             %r6, 0;             // discards the converted param_1 value
    mov.s32             %r7, 0;
    mov.s32             %r0, 0;
    mov.s32             %r1, 0;
    mov.s32             r2, 0;
    mov.s32             %r3, 0;
    mov.s32             %r8, 0;
    mov.s32             %r2, 0;
    mov.s32             r1, 1048576;
    mul.wide.s32        %rd0, r1, %r2;      // 1048576 * 0, widened to 64 bits

    // The source register is 64 bits wide while the store type is .u32.
    // NOTE(review): PTX's relaxed operand-size rule for st should make this
    // write only the low 32 bits - confirm against the PTX ISA manual.
    st.local.u32        [%rd8], %rd0;

    // ---- vprintf($str, buffer) -----------------------------------------
    mov.u64             %rd12, $str;
    cvta.global.u64     %rd13, %rd12;       // generic address of the format bytes
    // Callseq Start 0
    {
        .reg .b32 temp_param_reg;           // declared, never referenced
        // <end>}
        .param .b64 param0;
        st.param.b64    [param0+0], %rd13;
        .param .b64 param1;
        st.param.b64    [param1+0], %rd7;
        .param .b32 retval0;
        call.uni (retval0), vprintf, (param0, param1);
        ld.param.b32    %r8, [retval0+0];   // return value is not read afterwards
        //{
    } // Callseq End 0

    // ---- Write the result ----------------------------------------------
    cvt.rn.f32.s32      %f3, %r7;           // %r7 still holds the 0 written above
    add.s64             %rd14, %rd4, %rd9;
    st.global.f32       [%rd14], %f3;
    ret;
}