|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4
- .target sm_60
- .address_size 64
-
- // .globl Fused_ReduceSum_split_7048982259467132493_kernel0
- // _ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E18input_0_red_shared has been demoted
- // _ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E8red_buf0 has been demoted
-
- .visible .entry Fused_ReduceSum_split_7048982259467132493_kernel0( // f32 sum kernel: each thread Kahan-accumulates 32 inputs, the block reduces them in shared memory, tid.x == 0 stores the total at output index ctaid.y
- .param .u64 Fused_ReduceSum_split_7048982259467132493_kernel0_param_0, // input pointer; read as f32 through ld.global.nc
- .param .u64 Fused_ReduceSum_split_7048982259467132493_kernel0_param_1, // output pointer; one f32 is stored at index ctaid.y
- .param .u64 Fused_ReduceSum_split_7048982259467132493_kernel0_param_2 // never loaded anywhere in this kernel body
- )
- { // outline: zero accumulator -> per-thread compensated sum -> shared-memory tree + warp-shuffle reduction -> global store
- .reg .pred %p<16>;
- .reg .f32 %f<74>;
- .reg .b32 %r<41>;
- .reg .b64 %rd<9>;
- // demoted variable
- .shared .align 4 .b8 _ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E18input_0_red_shared[4]; // 4 bytes: a single f32 accumulator
- // demoted variable
- .shared .align 4 .b8 _ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E8red_buf0[2048]; // 2048 bytes: 512 f32 slots, one per-thread partial sum
- // --- Setup: load pointers; the thread with tid.x == 0 zeroes the shared accumulator ---
- ld.param.u64 %rd3, [Fused_ReduceSum_split_7048982259467132493_kernel0_param_0];
- ld.param.u64 %rd4, [Fused_ReduceSum_split_7048982259467132493_kernel0_param_1];
- mov.u32 %r1, %tid.x;
- setp.ne.s32 %p3, %r1, 0;
- @%p3 bra BB0_2; // every thread except tid.x == 0 skips the initialisation
-
- mov.u32 %r11, 0;
- st.shared.u32 [_ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E18input_0_red_shared], %r11; // accumulator = 0 (all-zero bits)
-
- BB0_2:
- bar.sync 0; // make the zeroed accumulator visible before anyone uses it
- mov.u32 %r2, %ctaid.y;
- mad.lo.s32 %r39, %r2, 16384, %r1; // first element index = ctaid.y * 16384 + tid.x
- cvta.to.global.u64 %rd1, %rd4; // rd1 = output base (global address space)
- cvta.to.global.u64 %rd2, %rd3; // rd2 = input base (global address space)
- mov.f32 %f72, 0f00000000; // f72 = compensation term c, starts at 0.0
- mov.u32 %r40, -32; // loop counter: -32 up to 0 in steps of 8 => 4 iterations
- mov.f32 %f73, %f72; // f73 = running sum, starts at 0.0
- // --- Per-thread compensated (Kahan) summation, unrolled 8x: 4 iterations * 8 loads = 32 elements, 512 floats apart ---
- BB0_3:
- mul.wide.s32 %rd5, %r39, 4; // byte offset = element index * sizeof(f32)
- add.s64 %rd6, %rd2, %rd5;
- ld.global.nc.f32 %f8, [%rd6]; // x = input[index]
- sub.f32 %f9, %f8, %f72; // y = x - c
- add.f32 %f10, %f73, %f9; // t = sum + y
- sub.f32 %f11, %f10, %f73; // (t - sum)
- sub.f32 %f12, %f11, %f9; // c = (t - sum) - y; the same four-step pattern repeats for each load below
- ld.global.nc.f32 %f13, [%rd6+2048]; // element at +512 floats
- sub.f32 %f14, %f13, %f12;
- add.f32 %f15, %f10, %f14;
- sub.f32 %f16, %f15, %f10;
- sub.f32 %f17, %f16, %f14;
- ld.global.nc.f32 %f18, [%rd6+4096]; // element at +1024 floats
- sub.f32 %f19, %f18, %f17;
- add.f32 %f20, %f15, %f19;
- sub.f32 %f21, %f20, %f15;
- sub.f32 %f22, %f21, %f19;
- ld.global.nc.f32 %f23, [%rd6+6144]; // element at +1536 floats
- sub.f32 %f24, %f23, %f22;
- add.f32 %f25, %f20, %f24;
- sub.f32 %f26, %f25, %f20;
- sub.f32 %f27, %f26, %f24;
- ld.global.nc.f32 %f28, [%rd6+8192]; // element at +2048 floats
- sub.f32 %f29, %f28, %f27;
- add.f32 %f30, %f25, %f29;
- sub.f32 %f31, %f30, %f25;
- sub.f32 %f32, %f31, %f29;
- ld.global.nc.f32 %f33, [%rd6+10240]; // element at +2560 floats
- sub.f32 %f34, %f33, %f32;
- add.f32 %f35, %f30, %f34;
- sub.f32 %f36, %f35, %f30;
- sub.f32 %f37, %f36, %f34;
- ld.global.nc.f32 %f38, [%rd6+12288]; // element at +3072 floats
- sub.f32 %f39, %f38, %f37;
- add.f32 %f40, %f35, %f39;
- sub.f32 %f41, %f40, %f35;
- sub.f32 %f42, %f41, %f39;
- ld.global.nc.f32 %f43, [%rd6+14336]; // element at +3584 floats
- sub.f32 %f44, %f43, %f42;
- add.f32 %f73, %f40, %f44; // running sum carried into the next iteration
- sub.f32 %f45, %f73, %f40;
- sub.f32 %f72, %f45, %f44; // compensation term carried into the next iteration
- add.s32 %r39, %r39, 4096; // advance by 8 loads * 512 floats
- add.s32 %r40, %r40, 8;
- setp.ne.s32 %p4, %r40, 0;
- @%p4 bra BB0_3;
- // --- Publish the per-thread sum (f73 only; the compensation term is dropped) into red_buf0 ---
- mov.u32 %r13, %ntid.x;
- mov.u32 %r14, %tid.y;
- mad.lo.s32 %r15, %r14, %r13, %r1; // linear thread index = tid.y * ntid.x + tid.x
- and.b32 %r8, %r15, 511; // r8 = position within a group of 512 threads
- and.b32 %r9, %r15, -512; // r9 = index of the first thread of that group
- add.s32 %r16, %r9, %r8; // r9 + r8 recombines to the linear thread index
- shl.b32 %r17, %r16, 2; // slot byte offset
- mov.u32 %r18, _ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E8red_buf0;
- add.s32 %r10, %r18, %r17; // r10 = address of this thread's red_buf0 slot
- st.shared.f32 [%r10], %f73;
- bar.sync 0;
- setp.gt.u32 %p5, %r8, 255;
- @%p5 bra BB0_6;
- // tree step: threads with r8 < 256 add the slot 256 floats (1024 bytes) above their own
- ld.shared.f32 %f46, [%r10];
- ld.shared.f32 %f47, [%r10+1024];
- add.f32 %f48, %f46, %f47;
- st.shared.f32 [%r10], %f48;
-
- BB0_6:
- bar.sync 0;
- setp.gt.u32 %p6, %r8, 127;
- @%p6 bra BB0_8;
- // tree step: threads with r8 < 128 add the slot 128 floats (512 bytes) above their own
- ld.shared.f32 %f49, [%r10];
- ld.shared.f32 %f50, [%r10+512];
- add.f32 %f51, %f49, %f50;
- st.shared.f32 [%r10], %f51;
-
- BB0_8:
- bar.sync 0;
- setp.gt.u32 %p7, %r8, 63;
- @%p7 bra BB0_10;
- // tree step: threads with r8 < 64 add the slot 64 floats (256 bytes) above their own
- ld.shared.f32 %f52, [%r10];
- ld.shared.f32 %f53, [%r10+256];
- add.f32 %f54, %f52, %f53;
- st.shared.f32 [%r10], %f54;
-
- BB0_10:
- bar.sync 0;
- setp.gt.u32 %p8, %r8, 31;
- @%p8 bra BB0_12;
- // tree step: threads with r8 < 32 add the slot 32 floats (128 bytes) above their own
- ld.shared.f32 %f55, [%r10];
- ld.shared.f32 %f56, [%r10+128];
- add.f32 %f57, %f55, %f56;
- st.shared.f32 [%r10], %f57;
-
- BB0_12:
- setp.lt.u32 %p1, %r8, 32;
- bar.sync 0;
- @!%p1 bra BB0_15; // threads with r8 >= 32 skip the shuffle stage
- bra.uni BB0_13;
- // --- Shuffle stage for r8 < 32: shfl.sync.down with offsets 16, 8, 4, 2, 1 ---
- BB0_13:
- ld.shared.f32 %f58, [%r10];
- mov.b32 %r19, %f58; // reinterpret f32 bits as b32 for the shuffle
- mov.u32 %r20, 2; // offset used by the fourth shuffle
- mov.u32 %r21, 31; // 'c' (clamp) operand shared by every shuffle
- mov.u32 %r22, 16; // offset used by the first shuffle
- mov.u32 %r23, -1; // membermask 0xffffffff
- shfl.sync.down.b32 %r24|%p9, %r19, %r22, %r21, %r23; // predicate outputs %p9..%p13 are not read afterwards
- mov.b32 %f59, %r24;
- add.f32 %f60, %f58, %f59;
- mov.b32 %r25, %f60;
- mov.u32 %r26, 8;
- shfl.sync.down.b32 %r27|%p10, %r25, %r26, %r21, %r23; // offset 8
- mov.b32 %f61, %r27;
- add.f32 %f62, %f60, %f61;
- mov.b32 %r28, %f62;
- mov.u32 %r29, 4;
- shfl.sync.down.b32 %r30|%p11, %r28, %r29, %r21, %r23; // offset 4
- mov.b32 %f63, %r30;
- add.f32 %f64, %f62, %f63;
- mov.b32 %r31, %f64;
- shfl.sync.down.b32 %r32|%p12, %r31, %r20, %r21, %r23; // offset 2
- mov.b32 %f65, %r32;
- add.f32 %f66, %f64, %f65;
- mov.b32 %r33, %f66;
- mov.u32 %r34, 1;
- shfl.sync.down.b32 %r35|%p13, %r33, %r34, %r21, %r23; // offset 1
- mov.b32 %f67, %r35;
- add.f32 %f5, %f66, %f67;
- setp.ne.s32 %p14, %r8, 0;
- @%p14 bra BB0_15; // only the thread with r8 == 0 writes the shuffled total back
-
- st.shared.f32 [%r10], %f5;
-
- BB0_15:
- bar.sync 0;
- setp.ne.s32 %p15, %r8, 0;
- @%p15 bra BB0_17;
- // thread with r8 == 0: accumulator += red_buf0[r9] (the slot it just wrote, since its r8 is 0)
- ld.shared.f32 %f68, [_ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E18input_0_red_shared];
- shl.b32 %r36, %r9, 2;
- add.s32 %r38, %r18, %r36;
- ld.shared.f32 %f69, [%r38];
- add.f32 %f70, %f68, %f69;
- st.shared.f32 [_ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E18input_0_red_shared], %f70;
-
- BB0_17:
- setp.eq.s32 %p2, %r1, 0;
- bar.sync 0; // accumulator update must complete before it is read back below
- @!%p2 bra BB0_19; // only tid.x == 0 performs the global store
- bra.uni BB0_18;
-
- BB0_18:
- ld.shared.f32 %f71, [_ZZ49Fused_ReduceSum_split_7048982259467132493_kernel0E18input_0_red_shared];
- mul.wide.s32 %rd7, %r2, 4; // output byte offset = ctaid.y * sizeof(f32)
- add.s64 %rd8, %rd1, %rd7;
- st.global.f32 [%rd8], %f71; // output[ctaid.y] = accumulated sum
-
- BB0_19:
- bar.sync 0; // final barrier emitted by the compiler; no shared-memory access follows it
- ret;
- }
-
|