|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4
- .target sm_60
- .address_size 64
-
- // .globl Fused_AddN_17913439732236309768_kernel0
-
- .visible .entry Fused_AddN_17913439732236309768_kernel0(
- .param .u64 Fused_AddN_17913439732236309768_kernel0_param_0,
- .param .u64 Fused_AddN_17913439732236309768_kernel0_param_1,
- .param .u64 Fused_AddN_17913439732236309768_kernel0_param_2,
- .param .u64 Fused_AddN_17913439732236309768_kernel0_param_3
- )
- {
- // Element-wise sum of three f32 buffers:
- //   param_3[i] = (param_0[i] + param_1[i]) + param_2[i]
- //
- // Work split: a thread with tid.x in [0, 168) handles the four consecutive
- // floats starting at element ctaid.x * 672 + tid.x * 4 (168 * 4 = 672 per
- // CTA). Threads with tid.x >= 168 return without touching memory.
- // The three inputs are read with ld.global.nc; the output is written with a
- // single 16-byte vector store.
- .reg .pred %idle;        // true when this thread has no work
- .reg .b32 %tx;           // tid.x
- .reg .b32 %bx;           // ctaid.x
- .reg .b32 %lane_base;    // tid.x * 4
- .reg .b32 %elem;         // index of this thread's first float
- .reg .b64 %byte_off;     // %elem * sizeof(f32), sign-extended to 64 bits
- .reg .b64 %arg<4>;       // raw parameter values (generic addresses)
- .reg .b64 %gptr<4>;      // parameters converted to global-space addresses
- .reg .b64 %addr<4>;      // per-thread addresses: inputs 0..2, output 3
- .reg .f32 %x<4>;         // four floats loaded from param_0
- .reg .f32 %y<4>;         // four floats loaded from param_1
- .reg .f32 %z<4>;         // four floats loaded from param_2
- .reg .f32 %xy<4>;        // param_0 + param_1
- .reg .f32 %sum<4>;       // (param_0 + param_1) + param_2
-
- // Fetch the four buffer pointers.
- ld.param.u64 %arg0, [Fused_AddN_17913439732236309768_kernel0_param_0];
- ld.param.u64 %arg1, [Fused_AddN_17913439732236309768_kernel0_param_1];
- ld.param.u64 %arg2, [Fused_AddN_17913439732236309768_kernel0_param_2];
- ld.param.u64 %arg3, [Fused_AddN_17913439732236309768_kernel0_param_3];
-
- // Only the first 168 threads of each CTA participate.
- mov.u32 %tx, %tid.x;
- setp.ge.s32 %idle, %tx, 168;
- @%idle bra BB0_done;
-
- // byte_off = (ctaid.x * 672 + tid.x * 4) * 4
- mov.u32 %bx, %ctaid.x;
- shl.b32 %lane_base, %tx, 2;
- mad.lo.s32 %elem, %bx, 672, %lane_base;
- mul.wide.s32 %byte_off, %elem, 4;
-
- // Convert every pointer to the global state space and apply the shared offset.
- cvta.to.global.u64 %gptr0, %arg0;
- cvta.to.global.u64 %gptr1, %arg1;
- cvta.to.global.u64 %gptr2, %arg2;
- cvta.to.global.u64 %gptr3, %arg3;
- add.s64 %addr0, %gptr0, %byte_off;
- add.s64 %addr1, %gptr1, %byte_off;
- add.s64 %addr2, %gptr2, %byte_off;
- add.s64 %addr3, %gptr3, %byte_off;
-
- // One 4 x f32 vector load per input.
- ld.global.nc.v4.f32 {%x0, %x1, %x2, %x3}, [%addr0];
- ld.global.nc.v4.f32 {%y0, %y1, %y2, %y3}, [%addr1];
- ld.global.nc.v4.f32 {%z0, %z1, %z2, %z3}, [%addr2];
-
- // Accumulate in a fixed order: param_0 + param_1 first, then + param_2.
- add.f32 %xy0, %x0, %y0;
- add.f32 %xy1, %x1, %y1;
- add.f32 %xy2, %x2, %y2;
- add.f32 %xy3, %x3, %y3;
- add.f32 %sum0, %xy0, %z0;
- add.f32 %sum1, %xy1, %z1;
- add.f32 %sum2, %xy2, %z2;
- add.f32 %sum3, %xy3, %z3;
-
- // Write the four results back with one vector store.
- st.global.v4.f32 [%addr3], {%sum0, %sum1, %sum2, %sum3};
-
- BB0_done:
- ret;
- }
-
|