|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4
- .target sm_60
- .address_size 64
-
- // .globl Fused_ReduceSum_split_13638779044274093626_kernel0
- // _ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E18input_0_red_shared has been demoted
- // _ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E8red_buf0 has been demoted
- // Block-level f32 sum: each thread adds input[tid.x] and input[tid.x + 512] when those indices are <= 899, the per-thread partials are reduced through shared memory and warp shuffles, and threads with tid.x == 0 store the total as one f32 at the output pointer.
- .visible .entry Fused_ReduceSum_split_13638779044274093626_kernel0(
- .param .u64 Fused_ReduceSum_split_13638779044274093626_kernel0_param_0, // input pointer; read as f32 through ld.global.nc
- .param .u64 Fused_ReduceSum_split_13638779044274093626_kernel0_param_1 // output pointer; one f32 is stored at offset 0
- )
- {
- .reg .pred %p<17>; // predicate registers
- .reg .f32 %f<42>; // f32 registers
- .reg .b32 %r<32>; // 32-bit registers
- .reg .b64 %rd<10>; // 64-bit registers (addresses)
- // demoted variable: 4-byte shared slot, used below as the f32 result accumulator
- .shared .align 4 .b8 _ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E18input_0_red_shared[4];
- // demoted variable: 2048-byte shared scratch (512 f32 slots) indexed by the linear thread index
- .shared .align 4 .b8 _ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E8red_buf0[2048];
- // Load both pointer parameters and the thread's x index.
- ld.param.u64 %rd3, [Fused_ReduceSum_split_13638779044274093626_kernel0_param_0]; // %rd3 = input pointer
- ld.param.u64 %rd4, [Fused_ReduceSum_split_13638779044274093626_kernel0_param_1]; // %rd4 = output pointer
- mov.u32 %r3, %tid.x; // %r3 = tid.x
- setp.ne.s32 %p3, %r3, 0; // %p3 = (tid.x != 0)
- @%p3 bra BB0_2; // only tid.x == 0 falls through to the clear
- // tid.x == 0 only: clear the accumulator (all-zero bits, later read back as f32 0.0).
- mov.u32 %r4, 0;
- st.shared.u32 [_ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E18input_0_red_shared], %r4;
-
- BB0_2:
- cvta.to.global.u64 %rd1, %rd3; // %rd1 = input pointer converted to a global-space address
- bar.sync 0; // barrier 0; placed after the accumulator clear above
- mul.wide.s32 %rd5, %r3, 4; // %rd5 = tid.x * 4 (byte offset of f32 element tid.x)
- add.s64 %rd2, %rd1, %rd5; // %rd2 = &input[tid.x]
- mov.f32 %f39, 0f00000000; // %f39 = per-thread running sum, starts at 0.0
- setp.gt.s32 %p4, %r3, 899; // %p4 = (tid.x > 899)
- mov.f32 %f40, %f39; // %f40 = correction term, starts at 0.0
- @%p4 bra BB0_4; // skip the first load when tid.x is past index 899
- // First element: input[tid.x].
- ld.global.nc.f32 %f10, [%rd2]; // x0, loaded with the .nc (non-coherent) qualifier
- add.f32 %f39, %f10, 0f00000000; // sum = x0 + 0.0
- sub.f32 %f40, %f39, %f10; // correction = sum - x0; looks like a Kahan-style compensation term
-
- BB0_4:
- add.s32 %r7, %r3, 512; // %r7 = tid.x + 512
- setp.gt.s32 %p5, %r7, 899; // %p5 = (tid.x + 512 > 899)
- @%p5 bra BB0_6; // skip the second load when that index is past 899
- // Second element: input[tid.x + 512], i.e. 2048 bytes past the first.
- ld.global.nc.f32 %f11, [%rd2+2048]; // x1
- sub.f32 %f12, %f11, %f40; // x1 minus the correction term
- add.f32 %f39, %f39, %f12; // sum += corrected x1
-
- BB0_6:
- mov.u32 %r9, %tid.y;
- mov.u32 %r10, %ntid.x;
- mad.lo.s32 %r1, %r9, %r10, %r3; // %r1 = tid.y * ntid.x + tid.x (linear thread index i)
- shl.b32 %r12, %r1, 2; // byte offset i * 4
- mov.u32 %r13, _ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E8red_buf0; // base address of red_buf0
- add.s32 %r2, %r13, %r12; // %r2 = &red_buf0[i]
- st.shared.f32 [%r2], %f39; // NOTE(review): i is not bounds-checked and red_buf0 holds 512 slots - presumably launched with 512 threads per block; confirm against the launch configuration
- bar.sync 0; // all partial sums stored before the tree steps read them
- setp.gt.s32 %p6, %r1, 255;
- @%p6 bra BB0_8;
- // Tree step, i <= 255: red_buf0[i] += red_buf0[i + 256] (byte offset 1024).
- ld.shared.f32 %f13, [%r2];
- ld.shared.f32 %f14, [%r2+1024];
- add.f32 %f15, %f13, %f14;
- st.shared.f32 [%r2], %f15;
-
- BB0_8:
- bar.sync 0;
- setp.gt.s32 %p7, %r1, 127;
- @%p7 bra BB0_10;
- // Tree step, i <= 127: red_buf0[i] += red_buf0[i + 128] (byte offset 512).
- ld.shared.f32 %f16, [%r2];
- ld.shared.f32 %f17, [%r2+512];
- add.f32 %f18, %f16, %f17;
- st.shared.f32 [%r2], %f18;
-
- BB0_10:
- bar.sync 0;
- setp.gt.s32 %p8, %r1, 63;
- @%p8 bra BB0_12;
- // Tree step, i <= 63: red_buf0[i] += red_buf0[i + 64] (byte offset 256).
- ld.shared.f32 %f19, [%r2];
- ld.shared.f32 %f20, [%r2+256];
- add.f32 %f21, %f19, %f20;
- st.shared.f32 [%r2], %f21;
-
- BB0_12:
- bar.sync 0;
- setp.gt.s32 %p9, %r1, 31;
- @%p9 bra BB0_14;
- // Tree step, i <= 31: red_buf0[i] += red_buf0[i + 32] (byte offset 128).
- ld.shared.f32 %f22, [%r2];
- ld.shared.f32 %f23, [%r2+128];
- add.f32 %f24, %f22, %f23;
- st.shared.f32 [%r2], %f24;
-
- BB0_14:
- setp.lt.s32 %p1, %r1, 32; // %p1 = (i < 32)
- bar.sync 0;
- @!%p1 bra BB0_17; // threads with i >= 32 skip the shuffle stage
- bra.uni BB0_15;
- // Shuffle stage, i < 32: fold red_buf0[0..31] with shfl.sync.down at offsets 16, 8, 4, 2, 1.
- BB0_15:
- ld.shared.f32 %f25, [%r2]; // this thread's red_buf0[i]
- mov.b32 %r14, %f25; // move the f32 bits into a b32 register for shfl
- mov.u32 %r15, 2; // shuffle offset used by the fourth step
- mov.u32 %r16, 31; // 'c' operand shared by every shfl below
- mov.u32 %r17, 16; // shuffle offset for the first step
- mov.u32 %r18, -1; // member mask with all 32 bits set
- shfl.sync.down.b32 %r19|%p10, %r14, %r17, %r16, %r18; // offset 16; %p10 is never read afterwards
- mov.b32 %f26, %r19;
- add.f32 %f27, %f25, %f26;
- mov.b32 %r20, %f27;
- mov.u32 %r21, 8;
- shfl.sync.down.b32 %r22|%p11, %r20, %r21, %r16, %r18; // offset 8
- mov.b32 %f28, %r22;
- add.f32 %f29, %f27, %f28;
- mov.b32 %r23, %f29;
- mov.u32 %r24, 4;
- shfl.sync.down.b32 %r25|%p12, %r23, %r24, %r16, %r18; // offset 4
- mov.b32 %f30, %r25;
- add.f32 %f31, %f29, %f30;
- mov.b32 %r26, %f31;
- shfl.sync.down.b32 %r27|%p13, %r26, %r15, %r16, %r18; // offset 2 (%r15)
- mov.b32 %f32, %r27;
- add.f32 %f33, %f31, %f32;
- mov.b32 %r28, %f33;
- mov.u32 %r29, 1;
- shfl.sync.down.b32 %r30|%p14, %r28, %r29, %r16, %r18; // offset 1
- mov.b32 %f34, %r30;
- add.f32 %f7, %f33, %f34; // %f7 = this thread's result after the last shuffle step
- setp.ne.s32 %p15, %r1, 0;
- @%p15 bra BB0_17; // only i == 0 writes the result back
- // i == 0 only: store the shuffle result into red_buf0[0].
- st.shared.f32 [_ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E8red_buf0], %f7;
-
- BB0_17:
- bar.sync 0;
- setp.ne.s32 %p16, %r1, 0;
- @%p16 bra BB0_19;
- // i == 0 only: accumulator += red_buf0[0].
- ld.shared.f32 %f35, [_ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E18input_0_red_shared];
- ld.shared.f32 %f36, [_ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E8red_buf0];
- add.f32 %f37, %f35, %f36;
- st.shared.f32 [_ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E18input_0_red_shared], %f37;
-
- BB0_19:
- setp.eq.s32 %p2, %r3, 0; // %p2 = (tid.x == 0); note this tests tid.x, not the linear index
- bar.sync 0;
- @!%p2 bra BB0_21; // threads with tid.x != 0 skip the global store
- bra.uni BB0_20;
- // tid.x == 0 only: copy the accumulator to the output pointer.
- BB0_20:
- ld.shared.f32 %f38, [_ZZ50Fused_ReduceSum_split_13638779044274093626_kernel0E18input_0_red_shared];
- cvta.to.global.u64 %rd9, %rd4; // output pointer converted to a global-space address
- st.global.f32 [%rd9], %f38;
-
- BB0_21:
- bar.sync 0; // final barrier before every thread returns
- ret;
- }
-
|