|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4
- .target sm_60
- .address_size 64
-
- // .globl Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0
-
- .visible .entry Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0( // Kernel: writes (B - 1) * A through param_3 and (B - 1)^2 through param_2, element-wise on f32 data
- .param .u64 Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0_param_0, // param_0: address of f32 input A (only read; multiplied into the param_3 result)
- .param .u64 Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0_param_1, // param_1: address of f32 input B (only read; 1.0 is subtracted from every element)
- .param .u64 Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0_param_2, // param_2: address of f32 output receiving (B - 1) * (B - 1)
- .param .u64 Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0_param_3 // param_3: address of f32 output receiving (B - 1) * A
- )
- {
- .reg .pred %p<8>; // predicate registers %p0..%p7
- .reg .f32 %f<55>; // f32 value registers
- .reg .b32 %r<8>; // 32-bit index registers
- .reg .b64 %rd<20>; // 64-bit address / byte-offset registers
-
-
- ld.param.u64 %rd7, [Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0_param_0]; // %rd7 = A (generic address)
- ld.param.u64 %rd8, [Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0_param_1]; // %rd8 = B (generic address)
- ld.param.u64 %rd6, [Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0_param_2]; // %rd6 = squared-output address; converted to global only at BB0_5
- ld.param.u64 %rd9, [Fused_BroadcastTo_Sub_Mul_Mul_split_10239688874233782268_kernel0_param_3]; // %rd9 = product-output address
- cvta.to.global.u64 %rd1, %rd9; // %rd1 = global address of the param_3 output
- cvta.to.global.u64 %rd2, %rd8; // %rd2 = global address of B
- cvta.to.global.u64 %rd3, %rd7; // %rd3 = global address of A
- mov.u32 %r1, %ctaid.x; // %r1 = block index (x)
- setp.gt.s32 %p1, %r1, 12; // stage 1 only runs in blocks 0..12
- mov.u32 %r2, %tid.x; // %r2 = thread index (x)
- setp.gt.s32 %p2, %r2, 17; // stage 1 only runs in threads 0..17
- or.pred %p3, %p1, %p2;
- @%p3 bra BB0_5; // outside the stage-1 range: go straight to stage 2
-
- setp.lt.s32 %p4, %r1, 12; // blocks 0..11 take the full-tile path
- @%p4 bra BB0_4;
- bra.uni BB0_2; // only block 12 reaches here (not > 12 and not < 12): tail path
-
- BB0_4: // Stage 1, full tile: this block covers 72 floats, 4 per thread (threads 0..17)
- shl.b32 %r4, %r2, 2; // %r4 = tid.x * 4 (four floats per thread)
- mad.lo.s32 %r5, %r1, 72, %r4; // %r5 = ctaid.x * 72 + tid.x * 4 = first element index for this thread
- mul.wide.s32 %rd14, %r5, 4; // element index -> 64-bit byte offset (4 bytes per f32)
- add.s64 %rd15, %rd3, %rd14; // &A[index]
- ld.global.nc.v4.f32 {%f25, %f26, %f27, %f28}, [%rd15]; // load A[index .. index+3] via the non-coherent path
- add.s64 %rd16, %rd2, %rd14; // &B[index]
- ld.global.nc.v4.f32 {%f33, %f34, %f35, %f36}, [%rd16]; // load B[index .. index+3]
- add.f32 %f41, %f33, 0fBF800000; // 0fBF800000 is -1.0f, so this is B[index+0] - 1
- add.f32 %f42, %f34, 0fBF800000; // B[index+1] - 1
- add.f32 %f43, %f35, 0fBF800000; // B[index+2] - 1
- add.f32 %f44, %f36, 0fBF800000; // B[index+3] - 1
- add.s64 %rd17, %rd1, %rd14; // address of the param_3 output at index
- mul.f32 %f45, %f44, %f28; // lane 3: (B - 1) * A
- mul.f32 %f46, %f43, %f27; // lane 2
- mul.f32 %f47, %f42, %f26; // lane 1
- mul.f32 %f48, %f41, %f25; // lane 0
- st.global.v4.f32 [%rd17], {%f48, %f47, %f46, %f45}; // store lanes 0..3 in element order
- bra.uni BB0_5;
-
- BB0_2: // Stage 1, tail (block 12): only threads 0..8 work, covering 36 floats
- setp.gt.s32 %p5, %r2, 8;
- @%p5 bra BB0_5; // threads 9..17 have no tail work
-
- shl.b32 %r3, %r2, 2; // %r3 = tid.x * 4
- mul.wide.s32 %rd10, %r3, 4; // byte offset inside the tail
- add.s64 %rd11, %rd3, %rd10;
- ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd11+3456]; // 3456 bytes = 864 floats = 12 * 72, i.e. just past the 12 full tiles of A
- add.s64 %rd12, %rd2, %rd10;
- ld.global.nc.v4.f32 {%f9, %f10, %f11, %f12}, [%rd12+3456]; // matching 4 floats of B
- add.f32 %f17, %f9, 0fBF800000; // B - 1 (0fBF800000 is -1.0f)
- add.f32 %f18, %f10, 0fBF800000;
- add.f32 %f19, %f11, 0fBF800000;
- add.f32 %f20, %f12, 0fBF800000;
- add.s64 %rd13, %rd1, %rd10;
- mul.f32 %f21, %f20, %f4; // lane 3: (B - 1) * A
- mul.f32 %f22, %f19, %f3; // lane 2
- mul.f32 %f23, %f18, %f2; // lane 1
- mul.f32 %f24, %f17, %f1; // lane 0
- st.global.v4.f32 [%rd13+3456], {%f24, %f23, %f22, %f21}; // store lanes 0..3 at the same 3456-byte tail offset
- // NOTE(review): the v4 accesses above need 16-byte-aligned addresses; every offset used is a multiple of 16, so this assumes 16-byte-aligned base pointers - confirm against the allocator
- BB0_5: // Stage 2: out2[i] = (B[i] - 1)^2, one float per access, 60 floats per block; no ctaid.x guard here
- cvta.to.global.u64 %rd18, %rd6; // %rd18 = global address of the param_2 output
- mad.lo.s32 %r6, %r1, 60, %r2; // %r6 = ctaid.x * 60 + tid.x
- mul.wide.s32 %rd19, %r6, 4; // element index -> 64-bit byte offset
- add.s64 %rd4, %rd2, %rd19; // %rd4 = &B[ctaid.x * 60 + tid.x]
- add.s64 %rd5, %rd18, %rd19; // %rd5 = address of the matching param_2 output element
- setp.gt.s32 %p6, %r2, 59;
- @%p6 bra BB0_7; // first pass only for tid.x <= 59
-
- ld.global.nc.f32 %f49, [%rd4];
- add.f32 %f50, %f49, 0fBF800000; // B - 1
- mul.f32 %f51, %f50, %f50; // (B - 1)^2
- st.global.f32 [%rd5], %f51;
- // NOTE(review): the +36 stride below suggests a 36-thread block, and stage 2 relies on the grid size to stay in bounds - launch configuration is not visible here, confirm in host code
- BB0_7: // Stage 2, second pass: same computation 36 elements further on
- add.s32 %r7, %r2, 36;
- setp.gt.s32 %p7, %r7, 59;
- @%p7 bra BB0_9; // second pass only for tid.x <= 23 (tid.x + 36 <= 59)
-
- ld.global.nc.f32 %f52, [%rd4+144]; // 144 bytes = 36 floats
- add.f32 %f53, %f52, 0fBF800000; // B - 1
- mul.f32 %f54, %f53, %f53; // (B - 1)^2
- st.global.f32 [%rd5+144], %f54;
- // NOTE(review): A and B are read with ld.global.nc while param_2/param_3 are written; presumably the outputs never alias the inputs - verify with the caller
- BB0_9:
- ret;
- }
-
|