|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4
- .target sm_60
- .address_size 64
-
- // .globl Fused_ReLU_723564827244450994_kernel0
-
- // ---------------------------------------------------------------------------
- // Fused_ReLU_723564827244450994_kernel0
- //
- // Elementwise ReLU over f32 data, four consecutive elements per thread.
- //   param_0 : address the inputs are loaded from (converted to global space)
- //   param_1 : address the results are stored to  (converted to global space)
- //
- // Each thread processes elements e .. e+3, where
- //   e = %ctaid.x * 4096 + %tid.x * 4
- // and stores (x > 0 ? x : 0) for each one. The test uses the ordered `gt`
- // comparison, so a NaN input does not pass it and is stored as 0.
- //
- // NOTE(review): nothing here checks the index against a buffer length; the
- //   launch geometry has to match the buffers exactly. The 4096-element block
- //   stride suggests 1024 threads per block -- confirm against the launcher.
- // NOTE(review): the v4 load/store are 16-byte accesses; presumably both
- //   buffers are 16-byte aligned -- confirm against the allocator.
- // ---------------------------------------------------------------------------
- .visible .entry Fused_ReLU_723564827244450994_kernel0(
- .param .u64 Fused_ReLU_723564827244450994_kernel0_param_0,
- .param .u64 Fused_ReLU_723564827244450994_kernel0_param_1
- )
- {
- .reg .pred %pos<4>;
- .reg .f32 %x<4>;
- .reg .f32 %y<4>;
- .reg .b32 %blk, %blk_first, %thr, %thr_first, %elem;
- .reg .b64 %in_gen, %out_gen, %in_glb, %out_glb, %byte_off, %in_ptr, %out_ptr;
-
- // Fetch both pointer parameters and convert them to global-space addresses.
- ld.param.u64 %in_gen, [Fused_ReLU_723564827244450994_kernel0_param_0];
- ld.param.u64 %out_gen, [Fused_ReLU_723564827244450994_kernel0_param_1];
- cvta.to.global.u64 %in_glb, %in_gen;
- cvta.to.global.u64 %out_glb, %out_gen;
-
- // elem = ctaid.x * 4096 + tid.x * 4  (first of this thread's four elements)
- mov.u32 %blk, %ctaid.x;
- mov.u32 %thr, %tid.x;
- shl.b32 %blk_first, %blk, 12;
- shl.b32 %thr_first, %thr, 2;
- add.s32 %elem, %thr_first, %blk_first;
-
- // byte offset = elem * 4, widened (signed) to 64 bits; the same offset is
- // applied to the load address and the store address.
- mul.wide.s32 %byte_off, %elem, 4;
- add.s64 %in_ptr, %in_glb, %byte_off;
- add.s64 %out_ptr, %out_glb, %byte_off;
-
- // One 4 x f32 vector load through the .nc (non-coherent) path.
- ld.global.nc.v4.f32 {%x0, %x1, %x2, %x3}, [%in_ptr];
-
- // Per lane: y = (x > 0) ? x : +0.0
- setp.gt.f32 %pos0, %x0, 0f00000000;
- selp.f32 %y0, %x0, 0f00000000, %pos0;
- setp.gt.f32 %pos1, %x1, 0f00000000;
- selp.f32 %y1, %x1, 0f00000000, %pos1;
- setp.gt.f32 %pos2, %x2, 0f00000000;
- selp.f32 %y2, %x2, 0f00000000, %pos2;
- setp.gt.f32 %pos3, %x3, 0f00000000;
- selp.f32 %y3, %x3, 0f00000000, %pos3;
-
- // One 4 x f32 vector store, lanes in the same order they were loaded.
- st.global.v4.f32 [%out_ptr], {%y0, %y1, %y2, %y3};
- ret;
- }
-
|