|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4
- .target sm_60
- .address_size 64
-
- // .globl Fused_Tanh_fusion_11812052120777794614_kernel0
- // Element-wise f32 kernel: out[i] = tanh(in[i]) with i = ctaid.x * 512 + tid.x; param_0 is the input pointer, param_1 the output pointer.
- .visible .entry Fused_Tanh_fusion_11812052120777794614_kernel0(
- .param .u64 Fused_Tanh_fusion_11812052120777794614_kernel0_param_0,
- .param .u64 Fused_Tanh_fusion_11812052120777794614_kernel0_param_1
- )
- {
- .reg .pred %p<4>;
- .reg .f32 %f<33>;
- .reg .b32 %r<10>;
- .reg .b64 %rd<9>;
- // Register roles: %f1 = x (input element), %f2 = |x|, %f32 = result; %rd4 = byte offset i * 4, shared by the load and the store.
- // NOTE(review): no bounds check on the index is visible here; presumably the launch covers both buffers exactly with 512-thread blocks - confirm against the host code.
- ld.param.u64 %rd2, [Fused_Tanh_fusion_11812052120777794614_kernel0_param_0]; // %rd2 = input pointer (generic address)
- ld.param.u64 %rd1, [Fused_Tanh_fusion_11812052120777794614_kernel0_param_1]; // %rd1 = output pointer (generic address)
- mov.u32 %r2, %ctaid.x;
- shl.b32 %r3, %r2, 9; // ctaid.x * 512
- mov.u32 %r4, %tid.x;
- add.s32 %r1, %r3, %r4; // %r1 = element index i
- cvta.to.global.u64 %rd3, %rd2; // generic -> global address
- mul.wide.s32 %rd4, %r1, 4; // byte offset = i * 4, widened (signed) to 64 bits
- add.s64 %rd5, %rd3, %rd4; // &in[i]
- ld.global.nc.f32 %f1, [%rd5]; // x = in[i], non-coherent global load
- abs.f32 %f2, %f1; // |x|
- setp.ltu.f32 %p1, %f2, 0f3F0CCCCD; // |x| < 0.55, or x is NaN (ltu = less-than-or-unordered)
- @%p1 bra BB0_2; // small |x| and NaN take the polynomial path
- bra.uni BB0_1; // everything else takes the exponential path
- // Small-argument path: result = x + x^3 * P(x^2), where P is a cubic in x^2 evaluated by Horner's rule.
- BB0_2:
- mul.f32 %f21, %f1, %f1; // x^2
- mov.f32 %f22, 0fBD57BE66; // c2 ~ -0.05267
- mov.f32 %f23, 0f3C86A81B; // c3 ~ 0.01644
- fma.rn.f32 %f24, %f23, %f21, %f22; // c3 * x^2 + c2
- mov.f32 %f25, 0f3E08677B; // c1 ~ 0.13321
- fma.rn.f32 %f26, %f24, %f21, %f25; // (...) * x^2 + c1
- mov.f32 %f27, 0fBEAAAA29; // c0 ~ -0.33333 (close to -1/3)
- fma.rn.f32 %f28, %f26, %f21, %f27; // P(x^2)
- mul.f32 %f29, %f21, %f28; // x^2 * P(x^2)
- fma.rn.f32 %f30, %f29, %f1, %f1; // x + x^3 * P(x^2)
- add.f32 %f31, %f1, %f1; // x + x, selected only when x == 0
- setp.eq.f32 %p3, %f1, 0f00000000; // x == 0 (true for +0 and -0)
- selp.f32 %f32, %f31, %f30, %p3; // x == 0 ? x + x : polynomial; x + x keeps the sign of a zero input
- bra.uni BB0_3;
- // Large-argument path: tanh(|x|) = 1 - 2 / (exp(2|x|) + 1), then the sign bit of x is OR-ed back onto the result.
- BB0_1:
- add.f32 %f8, %f2, %f2; // t = 2|x|
- mul.f32 %f9, %f8, 0f3FB8AA3B; // t * log2(e)
- cvt.rzi.f32.f32 %f10, %f9; // n = trunc(t * log2(e)), kept as a float
- mov.f32 %f11, 0fBF317200; // -ln(2), high part (~ -0.693146)
- fma.rn.f32 %f12, %f10, %f11, %f8; // t - n * ln2_hi
- mov.f32 %f13, 0fB5BFBE8E; // -ln(2), low part (~ -1.4286e-6)
- fma.rn.f32 %f14, %f10, %f13, %f12; // r = t - n * ln(2), two-step reduction
- mul.f32 %f15, %f14, 0f3FB8AA3B; // r * log2(e)
- ex2.approx.ftz.f32 %f16, %f15; // 2^(r * log2(e)) = exp(r), approximate
- ex2.approx.f32 %f17, %f10; // 2^n
- mov.f32 %f18, 0f3F800000; // 1.0
- fma.rn.f32 %f7, %f16, %f17, %f18; // exp(r) * 2^n + 1 = exp(2|x|) + 1
- // inline asm
- rcp.approx.ftz.f32 %f6,%f7; // approximate 1 / (exp(2|x|) + 1)
- // inline asm
- mov.f32 %f19, 0fC0000000; // -2.0
- fma.rn.f32 %f20, %f6, %f19, %f18; // 1 - 2 / (exp(2|x|) + 1)
- mov.b32 %r5, %f20; // reinterpret the magnitude as raw bits
- setp.ltu.f32 %p2, %f2, 0f42B00000; // |x| < 88.0
- selp.b32 %r6, %r5, 1065353216, %p2; // for |x| >= 88 substitute the bit pattern of 1.0 (0x3F800000)
- mov.b32 %r7, %f1; // raw bits of x
- and.b32 %r8, %r7, -2147483648; // isolate the sign bit of x (mask 0x80000000)
- or.b32 %r9, %r6, %r8; // put the sign of x on the magnitude
- mov.b32 %f32, %r9; // %f32 = result
- // Common exit for both paths: store the result to out[i].
- BB0_3:
- cvta.to.global.u64 %rd6, %rd1; // generic -> global address
- add.s64 %rd8, %rd6, %rd4; // &out[i], same byte offset as the load
- st.global.f32 [%rd8], %f32; // out[i] = result
- ret;
- }
-
|