|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4 // PTX ISA version this module is written against
- .target sm_60 // target architecture for code generation
- .address_size 64 // addresses are 64 bits wide (pointers below use .u64 / .b64)
-
- // .globl Fused_BiasAdd_3845489885241350506_kernel0
- // _ZZ41Fused_BiasAdd_3845489885241350506_kernel0E14input_0_shared has been demoted
- // Summary: each thread computes out[i..i+3] = in[i..i+3] + s, where in = param_0, out = param_2, s = param_1[4*ctaid.y + tid.x/256] staged through shared memory (presumably the bias, going by the kernel name), and i = 16384*ctaid.y + 4096*(tid.x/256) + 1024*ctaid.x + 4*(tid.x mod 256).
- .visible .entry Fused_BiasAdd_3845489885241350506_kernel0(
- .param .u64 Fused_BiasAdd_3845489885241350506_kernel0_param_0, // address read with v4.f32 loads (input)
- .param .u64 Fused_BiasAdd_3845489885241350506_kernel0_param_1, // address of the f32 values staged into shared memory
- .param .u64 Fused_BiasAdd_3845489885241350506_kernel0_param_2 // address written with v4.f32 stores (output)
- )
- {
- .reg .pred %p<2>;
- .reg .f32 %f<15>;
- .reg .b32 %r<26>;
- .reg .b64 %rd<12>;
- // demoted variable: 16-byte shared buffer, used below as four f32 slots
- .shared .align 4 .b8 _ZZ41Fused_BiasAdd_3845489885241350506_kernel0E14input_0_shared[16];
- // Fetch the three pointer parameters, then let only threads with tid.x <= 3 run the staging step.
- ld.param.u64 %rd3, [Fused_BiasAdd_3845489885241350506_kernel0_param_0];
- ld.param.u64 %rd4, [Fused_BiasAdd_3845489885241350506_kernel0_param_1];
- ld.param.u64 %rd5, [Fused_BiasAdd_3845489885241350506_kernel0_param_2];
- mov.u32 %r1, %tid.x; // r1 = tid.x, used by every index computation below
- setp.gt.s32 %p1, %r1, 3; // p1 = (tid.x > 3), signed compare
- @%p1 bra BB0_2; // threads with tid.x > 3 skip the staging step
- // Staging: thread t (t <= 3) copies param_1[4*ctaid.y + t] into shared slot t.
- mov.u32 %r3, %ctaid.y;
- shl.b32 %r4, %r3, 2; // r4 = 4 * ctaid.y
- add.s32 %r5, %r4, %r1; // r5 = 4 * ctaid.y + tid.x, an f32 element index into param_1
- cvta.to.global.u64 %rd6, %rd4; // param_1 converted to a global-space address
- mul.wide.s32 %rd7, %r5, 4; // byte offset = index * 4, sign-extended to 64 bits
- add.s64 %rd8, %rd6, %rd7;
- ld.global.nc.f32 %f1, [%rd8]; // load through the non-coherent (read-only data) path
- shl.b32 %r6, %r1, 2; // byte offset of shared slot tid.x
- mov.u32 %r7, _ZZ41Fused_BiasAdd_3845489885241350506_kernel0E14input_0_shared;
- add.s32 %r8, %r7, %r6;
- st.shared.f32 [%r8], %f1; // shared[tid.x] = param_1[4*ctaid.y + tid.x]
- // All threads: wait for the staged values, then add one of them to four consecutive f32 elements.
- BB0_2:
- cvta.to.global.u64 %rd1, %rd5; // rd1 = output base (param_2)
- cvta.to.global.u64 %rd2, %rd3; // rd2 = input base (param_0)
- mov.u32 %r2, %ctaid.y; // ctaid.y is read again here because r3 is only set on the staging path
- bar.sync 0; // barrier 0: the shared stores above complete before any thread reads the buffer
- shl.b32 %r9, %r2, 14; // r9 = 16384 * ctaid.y
- shr.s32 %r10, %r1, 31; // sign mask of tid.x: 0, or -1 if negative
- shr.u32 %r11, %r10, 24; // 0, or 255 when tid.x is negative
- add.s32 %r12, %r1, %r11; // pre-adjust so the arithmetic shift below rounds toward zero
- shr.s32 %r13, %r12, 8; // r13 = tid.x / 256 (signed, truncating)
- shl.b32 %r14, %r13, 12; // r14 = 4096 * (tid.x / 256)
- add.s32 %r15, %r14, %r9;
- mov.u32 %r16, %ctaid.x;
- shl.b32 %r17, %r16, 10; // r17 = 1024 * ctaid.x
- add.s32 %r18, %r15, %r17;
- and.b32 %r19, %r12, 1073741568; // mask 0x3FFFFF00 clears the low 8 bits, and the top 2 bits which the shl by 2 below shifts out anyway
- sub.s32 %r20, %r1, %r19; // r20 = tid.x mod 256, up to the top bits discarded by the next shift
- shl.b32 %r21, %r20, 2; // r21 = 4 * (tid.x mod 256): each thread handles 4 consecutive f32 elements
- add.s32 %r22, %r18, %r21; // r22 = i, the first f32 element index handled by this thread
- mul.wide.s32 %rd9, %r22, 4; // byte offset 4 * i, reused for both the load and the store
- add.s64 %rd10, %rd2, %rd9;
- ld.global.nc.v4.f32 {%f2, %f3, %f4, %f5}, [%rd10]; // 16-byte vector load of in[i..i+3] via the non-coherent path
- shl.b32 %r23, %r13, 2; // byte offset of shared slot tid.x / 256
- mov.u32 %r24, _ZZ41Fused_BiasAdd_3845489885241350506_kernel0E14input_0_shared;
- add.s32 %r25, %r24, %r23;
- ld.shared.f32 %f10, [%r25]; // f10 = shared[tid.x / 256]. NOTE(review): the buffer holds four f32 slots, so this assumes tid.x < 1024 (confirm launch config)
- add.s64 %rd11, %rd1, %rd9; // rd11 = output address for element i
- add.f32 %f11, %f5, %f10;
- add.f32 %f12, %f4, %f10;
- add.f32 %f13, %f3, %f10;
- add.f32 %f14, %f2, %f10;
- st.global.v4.f32 [%rd11], {%f14, %f13, %f12, %f11}; // out[i..i+3], components in the same order as they were loaded (f2..f5)
- bar.sync 0; // NOTE(review): only ret follows, with no further shared-memory access in this kernel. Looks redundant, confirm before removing
- ret;
- }
-
|