|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4
- .target sm_60
- .address_size 64
-
- // .globl Fused_ReduceSum_split_11517375051859762236_kernel0
- // _ZZ50Fused_ReduceSum_split_11517375051859762236_kernel0E18input_0_red_shared has been demoted
- // _ZZ50Fused_ReduceSum_split_11517375051859762236_kernel0E8red_buf0 has been demoted
-
- .visible .entry Fused_ReduceSum_split_11517375051859762236_kernel0( // sum reduction: per-thread compensated sums -> shared-memory/shuffle reduction -> atomic add into the output
- .param .u64 Fused_ReduceSum_split_11517375051859762236_kernel0_param_0, // input pointer; read as f32 through ld.global.nc
- .param .u64 Fused_ReduceSum_split_11517375051859762236_kernel0_param_1, // output pointer; f32 values are accumulated into it with atom.global.add
- .param .u64 Fused_ReduceSum_split_11517375051859762236_kernel0_param_2 // never loaded anywhere in this kernel body
- )
- { // NOTE(review): the indexing below appears to assume ntid.x == 128, ntid.y <= 8 and at most 1024 threads per block - confirm against the launch configuration
- .reg .pred %p<15>;
- .reg .f32 %f<59>;
- .reg .b32 %r<49>;
- .reg .b64 %rd<9>;
- // demoted variable: accumulator slots addressed as 4-byte values at index tid.y (32 bytes = 8 slots)
- .shared .align 4 .b8 _ZZ50Fused_ReduceSum_split_11517375051859762236_kernel0E18input_0_red_shared[32];
- // demoted variable: reduction scratch addressed as f32 at the linear thread id (4096 bytes = 1024 slots)
- .shared .align 4 .b8 _ZZ50Fused_ReduceSum_split_11517375051859762236_kernel0E8red_buf0[4096];
- // Fetch the two pointers that are used and locate this thread's accumulator slot.
- ld.param.u64 %rd2, [Fused_ReduceSum_split_11517375051859762236_kernel0_param_0]; // %rd2 = input pointer
- ld.param.u64 %rd3, [Fused_ReduceSum_split_11517375051859762236_kernel0_param_1]; // %rd3 = output pointer
- mov.u32 %r1, %tid.x; // %r1 = tid.x
- mov.u32 %r8, %tid.y; // %r8 = tid.y
- shl.b32 %r9, %r8, 2; // byte offset tid.y * 4
- mov.u32 %r10, _ZZ50Fused_ReduceSum_split_11517375051859762236_kernel0E18input_0_red_shared; // %r10 = base address of the accumulator slots
- add.s32 %r2, %r10, %r9; // %r2 = &input_0_red_shared[tid.y]
- setp.ne.s32 %p2, %r1, 0; // %p2 = (tid.x != 0)
- @%p2 bra BB0_2; // only tid.x == 0 initializes the slot
- // tid.x == 0: clear the accumulator slot for this tid.y.
- mov.u32 %r11, 0;
- st.shared.u32 [%r2], %r11; // all-zero bit pattern, later read back as f32
- // Compute this thread's first element address, then sum 8 loads spaced 512 bytes apart.
- BB0_2:
- cvta.to.global.u64 %rd1, %rd2; // %rd1 = input pointer converted to a global-space address
- bar.sync 0; // the cleared accumulator slots are written before any thread proceeds
- mov.u32 %r4, %ctaid.y; // %r4 = ctaid.y
- shl.b32 %r12, %r4, 19; // ctaid.y * 524288
- shl.b32 %r13, %r8, 16; // tid.y * 65536
- mov.u32 %r14, %ctaid.x; // %r14 = ctaid.x
- shl.b32 %r15, %r14, 10; // ctaid.x * 1024
- add.s32 %r16, %r1, %r13;
- add.s32 %r17, %r16, %r12;
- add.s32 %r18, %r17, %r15; // %r18 = tid.x + tid.y*65536 + ctaid.y*524288 + ctaid.x*1024 (element index)
- mul.wide.s32 %rd4, %r18, 4; // byte offset = index * 4, widened to 64 bits as a signed value
- add.s64 %rd5, %rd1, %rd4; // %rd5 = address of this thread's first element
- ld.global.nc.f32 %f2, [%rd5]; // x0
- add.f32 %f3, %f2, 0f00000000; // sum = x0 + 0.0
- sub.f32 %f4, %f3, %f2; // c = sum - x0 (compensation term for the rounding error)
- ld.global.nc.f32 %f5, [%rd5+512]; // x1, 512 bytes (128 f32 elements) past x0
- sub.f32 %f6, %f5, %f4; // y = x1 - c
- add.f32 %f7, %f3, %f6; // t = sum + y (becomes the new sum)
- sub.f32 %f8, %f7, %f3; // t - sum
- sub.f32 %f9, %f8, %f6; // c = (t - sum) - y
- ld.global.nc.f32 %f10, [%rd5+1024]; // x2; the same y / t / c update repeats for each following element
- sub.f32 %f11, %f10, %f9;
- add.f32 %f12, %f7, %f11;
- sub.f32 %f13, %f12, %f7;
- sub.f32 %f14, %f13, %f11;
- ld.global.nc.f32 %f15, [%rd5+1536]; // x3
- sub.f32 %f16, %f15, %f14;
- add.f32 %f17, %f12, %f16;
- sub.f32 %f18, %f17, %f12;
- sub.f32 %f19, %f18, %f16;
- ld.global.nc.f32 %f20, [%rd5+2048]; // x4
- sub.f32 %f21, %f20, %f19;
- add.f32 %f22, %f17, %f21;
- sub.f32 %f23, %f22, %f17;
- sub.f32 %f24, %f23, %f21;
- ld.global.nc.f32 %f25, [%rd5+2560]; // x5
- sub.f32 %f26, %f25, %f24;
- add.f32 %f27, %f22, %f26;
- sub.f32 %f28, %f27, %f22;
- sub.f32 %f29, %f28, %f26;
- ld.global.nc.f32 %f30, [%rd5+3072]; // x6
- sub.f32 %f31, %f30, %f29;
- add.f32 %f32, %f27, %f31;
- sub.f32 %f33, %f32, %f27;
- sub.f32 %f34, %f33, %f31;
- ld.global.nc.f32 %f35, [%rd5+3584]; // x7, the last element this thread reads
- sub.f32 %f36, %f35, %f34; // y = x7 - c
- add.f32 %f37, %f32, %f36; // %f37 = per-thread sum; no further compensation term is computed
- mov.u32 %r19, %ntid.x; // %r19 = ntid.x
- mad.lo.s32 %r20, %r19, %r8, %r1; // %r20 = ntid.x * tid.y + tid.x (linear thread id)
- and.b32 %r5, %r20, 127; // %r5 = low 7 bits of the linear id: position inside a 128-thread group
- and.b32 %r6, %r20, -128; // %r6 = linear id with the low 7 bits cleared: first id of that group
- add.s32 %r21, %r6, %r5; // %r21 = %r6 + %r5, i.e. the linear id again
- shl.b32 %r22, %r21, 2; // byte offset linear id * 4
- mov.u32 %r23, _ZZ50Fused_ReduceSum_split_11517375051859762236_kernel0E8red_buf0; // %r23 = base address of red_buf0
- add.s32 %r7, %r23, %r22; // %r7 = &red_buf0[linear id]
- st.shared.f32 [%r7], %f37; // publish the per-thread sum
- bar.sync 0; // every per-thread sum is stored before the reduction reads them
- setp.gt.u32 %p3, %r5, 63;
- @%p3 bra BB0_4; // positions 64..127 skip step 1
- // Step 1: positions 0..63 add the value stored 256 bytes (64 slots) above their own.
- ld.shared.f32 %f38, [%r7];
- ld.shared.f32 %f39, [%r7+256];
- add.f32 %f40, %f38, %f39;
- st.shared.f32 [%r7], %f40;
-
- BB0_4:
- bar.sync 0; // step-1 stores complete before step 2 reads them
- setp.gt.u32 %p4, %r5, 31;
- @%p4 bra BB0_6; // positions 32..127 skip step 2
- // Step 2: positions 0..31 add the value stored 128 bytes (32 slots) above their own.
- ld.shared.f32 %f41, [%r7];
- ld.shared.f32 %f42, [%r7+128];
- add.f32 %f43, %f41, %f42;
- st.shared.f32 [%r7], %f43;
-
- BB0_6:
- setp.lt.u32 %p1, %r5, 32; // %p1 = (position < 32)
- bar.sync 0; // step-2 stores complete before they are reloaded
- @!%p1 bra BB0_9; // positions 32..127 skip the shuffle stage
- bra.uni BB0_7;
- // Step 3: positions 0..31 combine their values with shfl.sync.down at offsets 16, 8, 4, 2, 1.
- BB0_7:
- ld.shared.f32 %f44, [%r7]; // this position's partial sum after step 2
- mov.b32 %r24, %f44; // move the f32 bits into a b32 register for the shuffle
- mov.u32 %r25, 2; // shuffle offset 2 (used by the fourth shuffle)
- mov.u32 %r26, 31; // third source operand (clamp value) shared by all five shuffles
- mov.u32 %r27, 16; // shuffle offset 16
- mov.u32 %r28, -1; // member mask with all 32 bits set, shared by all five shuffles
- shfl.sync.down.b32 %r29|%p5, %r24, %r27, %r26, %r28; // offset 16; predicate %p5 is never read
- mov.b32 %f45, %r29;
- add.f32 %f46, %f44, %f45;
- mov.b32 %r30, %f46;
- mov.u32 %r31, 8; // shuffle offset 8
- shfl.sync.down.b32 %r32|%p6, %r30, %r31, %r26, %r28; // offset 8; predicate %p6 is never read
- mov.b32 %f47, %r32;
- add.f32 %f48, %f46, %f47;
- mov.b32 %r33, %f48;
- mov.u32 %r34, 4; // shuffle offset 4
- shfl.sync.down.b32 %r35|%p7, %r33, %r34, %r26, %r28; // offset 4; predicate %p7 is never read
- mov.b32 %f49, %r35;
- add.f32 %f50, %f48, %f49;
- mov.b32 %r36, %f50;
- shfl.sync.down.b32 %r37|%p8, %r36, %r25, %r26, %r28; // offset 2; predicate %p8 is never read
- mov.b32 %f51, %r37;
- add.f32 %f52, %f50, %f51;
- mov.b32 %r38, %f52;
- mov.u32 %r39, 1; // shuffle offset 1
- shfl.sync.down.b32 %r40|%p9, %r38, %r39, %r26, %r28; // offset 1; predicate %p9 is never read
- mov.b32 %f53, %r40;
- add.f32 %f1, %f52, %f53; // %f1 = value after the last shuffle step
- setp.ne.s32 %p10, %r5, 0;
- @%p10 bra BB0_9; // only position 0 writes the reduced value back
- // Position 0 stores the reduced value into its own red_buf0 slot.
- st.shared.f32 [%r7], %f1;
-
- BB0_9:
- bar.sync 0; // the reduced value is stored before it is read back below
- setp.ne.s32 %p11, %r5, 0;
- @%p11 bra BB0_11; // only position 0 of each 128-thread group continues
- // Position 0 adds its group's reduced value to the accumulator slot for its tid.y.
- ld.shared.f32 %f54, [%r2]; // current accumulator value for this tid.y
- shl.b32 %r41, %r6, 2; // byte offset of the group's first slot
- add.s32 %r43, %r23, %r41; // &red_buf0[first id of the group]
- ld.shared.f32 %f55, [%r43];
- add.f32 %f56, %f54, %f55;
- st.shared.f32 [%r2], %f56; // NOTE(review): plain (non-atomic) read-modify-write; looks safe only if one thread per tid.y gets here (e.g. ntid.x == 128) - confirm
-
- BB0_11:
- bar.sync 0; // accumulator slots are updated before they are read below
- setp.ne.s32 %p12, %r8, 0; // %p12 = (tid.y != 0)
- setp.gt.s32 %p13, %r1, 7; // %p13 = (tid.x > 7)
- or.pred %p14, %p13, %p12;
- @%p14 bra BB0_13; // only threads with tid.y == 0 and tid.x <= 7 continue
- // Each remaining thread adds accumulator slot tid.x to output element ctaid.y * 8 + tid.x.
- cvta.to.global.u64 %rd6, %rd3; // %rd6 = output pointer converted to a global-space address
- shl.b32 %r44, %r1, 2; // byte offset tid.x * 4
- add.s32 %r46, %r10, %r44; // &input_0_red_shared[tid.x]
- ld.shared.f32 %f57, [%r46];
- shl.b32 %r47, %r4, 3; // ctaid.y * 8
- add.s32 %r48, %r47, %r1; // output element index = ctaid.y * 8 + tid.x
- mul.wide.s32 %rd7, %r48, 4; // byte offset = index * 4, widened to 64 bits
- add.s64 %rd8, %rd6, %rd7; // address of the output element
- atom.global.add.f32 %f58, [%rd8], %f57; // atomic add into the output; the returned old value %f58 is unused
-
- BB0_13:
- bar.sync 0; // final barrier; nothing but ret follows it
- ret;
- }
-
|