|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4
- .target sm_60
- .address_size 64
-
- // .globl Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0
- // _ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E16T_abs_red_shared has been demoted
- // _ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E8red_buf0 has been demoted
-
- // ---------------------------------------------------------------------------
- // Kernel entry point. Per thread it processes six f32 elements, then takes
- // part in a block-wide sum that is atomically added to a single f32.
- //
- // Parameters (all 64-bit addresses, converted with cvta.to.global):
- //   param_0 : f32 input x, read with ld.global.nc
- //   param_1 : f32 input y, read with ld.global.nc
- //   param_2 : f32 accumulator; target of the final atom.global.add.f32
- //   param_3 : not loaded anywhere in this body
- //   param_4 : f32 output; receives the piecewise result t(x)
- //             (the kernel name suggests tanh; see the two branches below)
- //   param_5 : f32 output; receives t(x) - y, and is read back for the sum
- //
- // Indexing: first element index = ctaid.x * 6144 + tid.x; the remaining five
- // elements are at byte offsets +4096, +8192, ..., +20480 (1024 floats apart).
- // NOTE(review): this layout presumably expects 1024 threads per block in x so
- // that a block covers 6 * 1024 = 6144 elements -- confirm against the launch.
- // ---------------------------------------------------------------------------
- .visible .entry Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0(
- .param .u64 Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_0,
- .param .u64 Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_1,
- .param .u64 Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_2,
- .param .u64 Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_3,
- .param .u64 Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_4,
- .param .u64 Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_5
- )
- {
- .reg .pred %p<34>;
- .reg .f32 %f<268>;
- .reg .b32 %r<58>;
- .reg .b64 %rd<21>;
- // demoted variable
- // Single f32 in shared memory: per-block running total of the reduction.
- .shared .align 4 .b8 _ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E16T_abs_red_shared[4];
- // demoted variable
- // 4096-byte shared scratch buffer = 1024 f32 slots, one per reducing thread.
- .shared .align 4 .b8 _ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E8red_buf0[4096];
-
- // Load the five pointer parameters that are actually used (param_3 is skipped).
- ld.param.u64 %rd13, [Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_0];
- ld.param.u64 %rd9, [Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_1];
- ld.param.u64 %rd10, [Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_2];
- ld.param.u64 %rd11, [Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_4];
- ld.param.u64 %rd12, [Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0_param_5];
- // %r2 = ctaid.x * 6144 + tid.x (element index); %rd15 = %r2 * 4 (byte offset,
- // reused below for every buffer); %rd1 = address of x[%r2].
- mov.u32 %r5, %ctaid.x;
- mov.u32 %r1, %tid.x;
- mad.lo.s32 %r2, %r5, 6144, %r1;
- cvta.to.global.u64 %rd14, %rd13;
- mul.wide.s32 %rd15, %r2, 4;
- add.s64 %rd1, %rd14, %rd15;
- // ---- Element 0 (byte offset +0) ----
- // %f1 = x, %f2 = |x|. Branch on |x| < 0.55 (0f3F0CCCCD); .ltu is also true
- // when the comparison is unordered, so NaN takes the polynomial path.
- ld.global.nc.f32 %f1, [%rd1];
- abs.f32 %f2, %f1;
- setp.ltu.f32 %p3, %f2, 0f3F0CCCCD;
- @%p3 bra BB0_2;
- bra.uni BB0_1;
-
- BB0_2:
- // Small-|x| path: result = x + x * x^2 * P(x^2), where P is a degree-3
- // polynomial in x^2 evaluated by Horner's rule with fma
- // (coefficients 0f3C86A81B, 0fBD57BE66, 0f3E08677B, 0fBEAAAA29).
- // If x == 0 the result is x + x instead, which keeps the sign of zero.
- mul.f32 %f47, %f1, %f1;
- mov.f32 %f48, 0fBD57BE66;
- mov.f32 %f49, 0f3C86A81B;
- fma.rn.f32 %f50, %f49, %f47, %f48;
- mov.f32 %f51, 0f3E08677B;
- fma.rn.f32 %f52, %f50, %f47, %f51;
- mov.f32 %f53, 0fBEAAAA29;
- fma.rn.f32 %f54, %f52, %f47, %f53;
- mul.f32 %f55, %f47, %f54;
- fma.rn.f32 %f56, %f55, %f1, %f1;
- add.f32 %f57, %f1, %f1;
- setp.eq.f32 %p5, %f1, 0f00000000;
- selp.f32 %f262, %f57, %f56, %p5;
- bra.uni BB0_3;
-
- BB0_1:
- // Large-|x| path: with a = 2|x|, build E = ex2(r * log2e) * ex2(n) where
- // n = trunc(a * log2e) and r = a - n*ln2 (ln2 split into 0fBF317200 and
- // 0fB5BFBE8E), i.e. an exp(a) style evaluation via two ex2.approx calls.
- // Then magnitude = 1 - 2 * rcp(E + 1); it is replaced by 1.0 (1065353216 =
- // 0x3F800000) when |x| >= 88.0 (0f42B00000). Finally the sign bit of x
- // (mask -2147483648 = 0x80000000) is OR-ed into the magnitude.
- add.f32 %f34, %f2, %f2;
- mul.f32 %f35, %f34, 0f3FB8AA3B;
- cvt.rzi.f32.f32 %f36, %f35;
- mov.f32 %f37, 0fBF317200;
- fma.rn.f32 %f38, %f36, %f37, %f34;
- mov.f32 %f39, 0fB5BFBE8E;
- fma.rn.f32 %f40, %f36, %f39, %f38;
- mul.f32 %f41, %f40, 0f3FB8AA3B;
- ex2.approx.ftz.f32 %f42, %f41;
- ex2.approx.f32 %f43, %f36;
- mov.f32 %f44, 0f3F800000;
- fma.rn.f32 %f33, %f42, %f43, %f44;
- // inline asm
- rcp.approx.ftz.f32 %f32,%f33;
- // inline asm
- mov.f32 %f45, 0fC0000000;
- fma.rn.f32 %f46, %f32, %f45, %f44;
- mov.b32 %r6, %f46;
- setp.ltu.f32 %p4, %f2, 0f42B00000;
- selp.b32 %r7, %r6, 1065353216, %p4;
- mov.b32 %r8, %f1;
- and.b32 %r9, %r8, -2147483648;
- or.b32 %r10, %r7, %r9;
- mov.b32 %f262, %r10;
-
- BB0_3:
- // %f262 = t(x). Store it to the param_4 buffer (%rd2), load y from the
- // param_1 buffer (%rd3), and store t(x) - y to the param_5 buffer (%rd4).
- // %rd2/%rd3/%rd4 are reused with constant offsets for the other five elements.
- cvta.to.global.u64 %rd16, %rd9;
- cvta.to.global.u64 %rd17, %rd11;
- cvta.to.global.u64 %rd18, %rd12;
- add.s64 %rd2, %rd17, %rd15;
- st.global.f32 [%rd2], %f262;
- add.s64 %rd3, %rd16, %rd15;
- ld.global.nc.f32 %f58, [%rd3];
- sub.f32 %f59, %f262, %f58;
- add.s64 %rd4, %rd18, %rd15;
- st.global.f32 [%rd4], %f59;
- // Only the thread with tid.x == 0 zeroes the shared running total.
- setp.ne.s32 %p6, %r1, 0;
- @%p6 bra BB0_5;
-
- mov.u32 %r11, 0;
- st.shared.u32 [_ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E16T_abs_red_shared], %r11;
-
- BB0_5:
- // ---- Element 1 (byte offset +4096) ----
- // Same piecewise evaluation as element 0, with fresh registers; result in %f263.
- ld.global.nc.f32 %f6, [%rd1+4096];
- abs.f32 %f7, %f6;
- setp.ltu.f32 %p7, %f7, 0f3F0CCCCD;
- @%p7 bra BB0_7;
- bra.uni BB0_6;
-
- BB0_7:
- // Small-|x| polynomial path.
- mul.f32 %f75, %f6, %f6;
- mov.f32 %f76, 0fBD57BE66;
- mov.f32 %f77, 0f3C86A81B;
- fma.rn.f32 %f78, %f77, %f75, %f76;
- mov.f32 %f79, 0f3E08677B;
- fma.rn.f32 %f80, %f78, %f75, %f79;
- mov.f32 %f81, 0fBEAAAA29;
- fma.rn.f32 %f82, %f80, %f75, %f81;
- mul.f32 %f83, %f75, %f82;
- fma.rn.f32 %f84, %f83, %f6, %f6;
- add.f32 %f85, %f6, %f6;
- setp.eq.f32 %p9, %f6, 0f00000000;
- selp.f32 %f263, %f85, %f84, %p9;
- bra.uni BB0_8;
-
- BB0_6:
- // Large-|x| path: 1 - 2 * rcp(E + 1), clamped to 1.0 at |x| >= 88, sign of x applied.
- add.f32 %f62, %f7, %f7;
- mul.f32 %f63, %f62, 0f3FB8AA3B;
- cvt.rzi.f32.f32 %f64, %f63;
- mov.f32 %f65, 0fBF317200;
- fma.rn.f32 %f66, %f64, %f65, %f62;
- mov.f32 %f67, 0fB5BFBE8E;
- fma.rn.f32 %f68, %f64, %f67, %f66;
- mul.f32 %f69, %f68, 0f3FB8AA3B;
- ex2.approx.ftz.f32 %f70, %f69;
- ex2.approx.f32 %f71, %f64;
- mov.f32 %f72, 0f3F800000;
- fma.rn.f32 %f61, %f70, %f71, %f72;
- // inline asm
- rcp.approx.ftz.f32 %f60,%f61;
- // inline asm
- mov.f32 %f73, 0fC0000000;
- fma.rn.f32 %f74, %f60, %f73, %f72;
- mov.b32 %r12, %f74;
- setp.ltu.f32 %p8, %f7, 0f42B00000;
- selp.b32 %r13, %r12, 1065353216, %p8;
- mov.b32 %r14, %f6;
- and.b32 %r15, %r14, -2147483648;
- or.b32 %r16, %r13, %r15;
- mov.b32 %f263, %r16;
-
- BB0_8:
- // Store t(x) and t(x) - y for element 1, then start element 2.
- st.global.f32 [%rd2+4096], %f263;
- ld.global.nc.f32 %f86, [%rd3+4096];
- sub.f32 %f87, %f263, %f86;
- st.global.f32 [%rd4+4096], %f87;
- // ---- Element 2 (byte offset +8192) ---- result in %f264.
- ld.global.nc.f32 %f11, [%rd1+8192];
- abs.f32 %f12, %f11;
- setp.ltu.f32 %p10, %f12, 0f3F0CCCCD;
- @%p10 bra BB0_10;
- bra.uni BB0_9;
-
- BB0_10:
- // Small-|x| polynomial path.
- mul.f32 %f103, %f11, %f11;
- mov.f32 %f104, 0fBD57BE66;
- mov.f32 %f105, 0f3C86A81B;
- fma.rn.f32 %f106, %f105, %f103, %f104;
- mov.f32 %f107, 0f3E08677B;
- fma.rn.f32 %f108, %f106, %f103, %f107;
- mov.f32 %f109, 0fBEAAAA29;
- fma.rn.f32 %f110, %f108, %f103, %f109;
- mul.f32 %f111, %f103, %f110;
- fma.rn.f32 %f112, %f111, %f11, %f11;
- add.f32 %f113, %f11, %f11;
- setp.eq.f32 %p12, %f11, 0f00000000;
- selp.f32 %f264, %f113, %f112, %p12;
- bra.uni BB0_11;
-
- BB0_9:
- // Large-|x| path: 1 - 2 * rcp(E + 1), clamped to 1.0 at |x| >= 88, sign of x applied.
- add.f32 %f90, %f12, %f12;
- mul.f32 %f91, %f90, 0f3FB8AA3B;
- cvt.rzi.f32.f32 %f92, %f91;
- mov.f32 %f93, 0fBF317200;
- fma.rn.f32 %f94, %f92, %f93, %f90;
- mov.f32 %f95, 0fB5BFBE8E;
- fma.rn.f32 %f96, %f92, %f95, %f94;
- mul.f32 %f97, %f96, 0f3FB8AA3B;
- ex2.approx.ftz.f32 %f98, %f97;
- ex2.approx.f32 %f99, %f92;
- mov.f32 %f100, 0f3F800000;
- fma.rn.f32 %f89, %f98, %f99, %f100;
- // inline asm
- rcp.approx.ftz.f32 %f88,%f89;
- // inline asm
- mov.f32 %f101, 0fC0000000;
- fma.rn.f32 %f102, %f88, %f101, %f100;
- mov.b32 %r17, %f102;
- setp.ltu.f32 %p11, %f12, 0f42B00000;
- selp.b32 %r18, %r17, 1065353216, %p11;
- mov.b32 %r19, %f11;
- and.b32 %r20, %r19, -2147483648;
- or.b32 %r21, %r18, %r20;
- mov.b32 %f264, %r21;
-
- BB0_11:
- // Store t(x) and t(x) - y for element 2, then start element 3.
- st.global.f32 [%rd2+8192], %f264;
- ld.global.nc.f32 %f114, [%rd3+8192];
- sub.f32 %f115, %f264, %f114;
- st.global.f32 [%rd4+8192], %f115;
- // ---- Element 3 (byte offset +12288) ---- result in %f265.
- ld.global.nc.f32 %f16, [%rd1+12288];
- abs.f32 %f17, %f16;
- setp.ltu.f32 %p13, %f17, 0f3F0CCCCD;
- @%p13 bra BB0_13;
- bra.uni BB0_12;
-
- BB0_13:
- // Small-|x| polynomial path.
- mul.f32 %f131, %f16, %f16;
- mov.f32 %f132, 0fBD57BE66;
- mov.f32 %f133, 0f3C86A81B;
- fma.rn.f32 %f134, %f133, %f131, %f132;
- mov.f32 %f135, 0f3E08677B;
- fma.rn.f32 %f136, %f134, %f131, %f135;
- mov.f32 %f137, 0fBEAAAA29;
- fma.rn.f32 %f138, %f136, %f131, %f137;
- mul.f32 %f139, %f131, %f138;
- fma.rn.f32 %f140, %f139, %f16, %f16;
- add.f32 %f141, %f16, %f16;
- setp.eq.f32 %p15, %f16, 0f00000000;
- selp.f32 %f265, %f141, %f140, %p15;
- bra.uni BB0_14;
-
- BB0_12:
- // Large-|x| path: 1 - 2 * rcp(E + 1), clamped to 1.0 at |x| >= 88, sign of x applied.
- add.f32 %f118, %f17, %f17;
- mul.f32 %f119, %f118, 0f3FB8AA3B;
- cvt.rzi.f32.f32 %f120, %f119;
- mov.f32 %f121, 0fBF317200;
- fma.rn.f32 %f122, %f120, %f121, %f118;
- mov.f32 %f123, 0fB5BFBE8E;
- fma.rn.f32 %f124, %f120, %f123, %f122;
- mul.f32 %f125, %f124, 0f3FB8AA3B;
- ex2.approx.ftz.f32 %f126, %f125;
- ex2.approx.f32 %f127, %f120;
- mov.f32 %f128, 0f3F800000;
- fma.rn.f32 %f117, %f126, %f127, %f128;
- // inline asm
- rcp.approx.ftz.f32 %f116,%f117;
- // inline asm
- mov.f32 %f129, 0fC0000000;
- fma.rn.f32 %f130, %f116, %f129, %f128;
- mov.b32 %r22, %f130;
- setp.ltu.f32 %p14, %f17, 0f42B00000;
- selp.b32 %r23, %r22, 1065353216, %p14;
- mov.b32 %r24, %f16;
- and.b32 %r25, %r24, -2147483648;
- or.b32 %r26, %r23, %r25;
- mov.b32 %f265, %r26;
-
- BB0_14:
- // Store t(x) and t(x) - y for element 3, then start element 4.
- st.global.f32 [%rd2+12288], %f265;
- ld.global.nc.f32 %f142, [%rd3+12288];
- sub.f32 %f143, %f265, %f142;
- st.global.f32 [%rd4+12288], %f143;
- // ---- Element 4 (byte offset +16384) ---- result in %f266.
- ld.global.nc.f32 %f21, [%rd1+16384];
- abs.f32 %f22, %f21;
- setp.ltu.f32 %p16, %f22, 0f3F0CCCCD;
- @%p16 bra BB0_16;
- bra.uni BB0_15;
-
- BB0_16:
- // Small-|x| polynomial path.
- mul.f32 %f159, %f21, %f21;
- mov.f32 %f160, 0fBD57BE66;
- mov.f32 %f161, 0f3C86A81B;
- fma.rn.f32 %f162, %f161, %f159, %f160;
- mov.f32 %f163, 0f3E08677B;
- fma.rn.f32 %f164, %f162, %f159, %f163;
- mov.f32 %f165, 0fBEAAAA29;
- fma.rn.f32 %f166, %f164, %f159, %f165;
- mul.f32 %f167, %f159, %f166;
- fma.rn.f32 %f168, %f167, %f21, %f21;
- add.f32 %f169, %f21, %f21;
- setp.eq.f32 %p18, %f21, 0f00000000;
- selp.f32 %f266, %f169, %f168, %p18;
- bra.uni BB0_17;
-
- BB0_15:
- // Large-|x| path: 1 - 2 * rcp(E + 1), clamped to 1.0 at |x| >= 88, sign of x applied.
- add.f32 %f146, %f22, %f22;
- mul.f32 %f147, %f146, 0f3FB8AA3B;
- cvt.rzi.f32.f32 %f148, %f147;
- mov.f32 %f149, 0fBF317200;
- fma.rn.f32 %f150, %f148, %f149, %f146;
- mov.f32 %f151, 0fB5BFBE8E;
- fma.rn.f32 %f152, %f148, %f151, %f150;
- mul.f32 %f153, %f152, 0f3FB8AA3B;
- ex2.approx.ftz.f32 %f154, %f153;
- ex2.approx.f32 %f155, %f148;
- mov.f32 %f156, 0f3F800000;
- fma.rn.f32 %f145, %f154, %f155, %f156;
- // inline asm
- rcp.approx.ftz.f32 %f144,%f145;
- // inline asm
- mov.f32 %f157, 0fC0000000;
- fma.rn.f32 %f158, %f144, %f157, %f156;
- mov.b32 %r27, %f158;
- setp.ltu.f32 %p17, %f22, 0f42B00000;
- selp.b32 %r28, %r27, 1065353216, %p17;
- mov.b32 %r29, %f21;
- and.b32 %r30, %r29, -2147483648;
- or.b32 %r31, %r28, %r30;
- mov.b32 %f266, %r31;
-
- BB0_17:
- // Store t(x) and t(x) - y for element 4, then start element 5.
- st.global.f32 [%rd2+16384], %f266;
- ld.global.nc.f32 %f170, [%rd3+16384];
- sub.f32 %f171, %f266, %f170;
- st.global.f32 [%rd4+16384], %f171;
- // ---- Element 5 (byte offset +20480) ---- result in %f267.
- ld.global.nc.f32 %f26, [%rd1+20480];
- abs.f32 %f27, %f26;
- setp.ltu.f32 %p19, %f27, 0f3F0CCCCD;
- @%p19 bra BB0_19;
- bra.uni BB0_18;
-
- BB0_19:
- // Small-|x| polynomial path.
- mul.f32 %f187, %f26, %f26;
- mov.f32 %f188, 0fBD57BE66;
- mov.f32 %f189, 0f3C86A81B;
- fma.rn.f32 %f190, %f189, %f187, %f188;
- mov.f32 %f191, 0f3E08677B;
- fma.rn.f32 %f192, %f190, %f187, %f191;
- mov.f32 %f193, 0fBEAAAA29;
- fma.rn.f32 %f194, %f192, %f187, %f193;
- mul.f32 %f195, %f187, %f194;
- fma.rn.f32 %f196, %f195, %f26, %f26;
- add.f32 %f197, %f26, %f26;
- setp.eq.f32 %p21, %f26, 0f00000000;
- selp.f32 %f267, %f197, %f196, %p21;
- bra.uni BB0_20;
-
- BB0_18:
- // Large-|x| path: 1 - 2 * rcp(E + 1), clamped to 1.0 at |x| >= 88, sign of x applied.
- add.f32 %f174, %f27, %f27;
- mul.f32 %f175, %f174, 0f3FB8AA3B;
- cvt.rzi.f32.f32 %f176, %f175;
- mov.f32 %f177, 0fBF317200;
- fma.rn.f32 %f178, %f176, %f177, %f174;
- mov.f32 %f179, 0fB5BFBE8E;
- fma.rn.f32 %f180, %f176, %f179, %f178;
- mul.f32 %f181, %f180, 0f3FB8AA3B;
- ex2.approx.ftz.f32 %f182, %f181;
- ex2.approx.f32 %f183, %f176;
- mov.f32 %f184, 0f3F800000;
- fma.rn.f32 %f173, %f182, %f183, %f184;
- // inline asm
- rcp.approx.ftz.f32 %f172,%f173;
- // inline asm
- mov.f32 %f185, 0fC0000000;
- fma.rn.f32 %f186, %f172, %f185, %f184;
- mov.b32 %r32, %f186;
- setp.ltu.f32 %p20, %f27, 0f42B00000;
- selp.b32 %r33, %r32, 1065353216, %p20;
- mov.b32 %r34, %f26;
- and.b32 %r35, %r34, -2147483648;
- or.b32 %r36, %r33, %r35;
- mov.b32 %f267, %r36;
-
- BB0_20:
- // Store t(x) and t(x) - y for element 5.
- st.global.f32 [%rd2+20480], %f267;
- ld.global.nc.f32 %f198, [%rd3+20480];
- sub.f32 %f199, %f267, %f198;
- st.global.f32 [%rd4+20480], %f199;
- // ---- Per-thread partial sum of |t(x) - y| ----
- // After the barrier, each thread reads back the six differences it stored at
- // [%rd4 + k*4096] and accumulates their absolute values. Every step keeps a
- // correction term: y = v - c; t = sum + y; c = (t - sum) - y; sum = t
- // (the compensated / Kahan summation pattern). Final per-thread sum: %f231.
- bar.sync 0;
- ld.global.f32 %f200, [%rd4];
- abs.f32 %f201, %f200;
- add.f32 %f202, %f201, 0f00000000;
- sub.f32 %f203, %f202, %f201;
- ld.global.f32 %f204, [%rd4+4096];
- abs.f32 %f205, %f204;
- sub.f32 %f206, %f205, %f203;
- add.f32 %f207, %f202, %f206;
- sub.f32 %f208, %f207, %f202;
- sub.f32 %f209, %f208, %f206;
- ld.global.f32 %f210, [%rd4+8192];
- abs.f32 %f211, %f210;
- sub.f32 %f212, %f211, %f209;
- add.f32 %f213, %f207, %f212;
- sub.f32 %f214, %f213, %f207;
- sub.f32 %f215, %f214, %f212;
- ld.global.f32 %f216, [%rd4+12288];
- abs.f32 %f217, %f216;
- sub.f32 %f218, %f217, %f215;
- add.f32 %f219, %f213, %f218;
- sub.f32 %f220, %f219, %f213;
- sub.f32 %f221, %f220, %f218;
- ld.global.f32 %f222, [%rd4+16384];
- abs.f32 %f223, %f222;
- sub.f32 %f224, %f223, %f221;
- add.f32 %f225, %f219, %f224;
- sub.f32 %f226, %f225, %f219;
- sub.f32 %f227, %f226, %f224;
- ld.global.f32 %f228, [%rd4+20480];
- abs.f32 %f229, %f228;
- sub.f32 %f230, %f229, %f227;
- add.f32 %f231, %f225, %f230;
- // ---- Block-level reduction through shared memory ----
- // %r3 = tid.y * ntid.x + tid.x (linear thread index); %r4 = address of this
- // thread's f32 slot in red_buf0. Each thread publishes its partial sum.
- // NOTE(review): red_buf0 holds 1024 slots and the first step below reads
- // slot %r3 + 512, so this presumably requires exactly 1024 threads per
- // block -- confirm against the launch configuration.
- mov.u32 %r37, %tid.y;
- mov.u32 %r38, %ntid.x;
- mad.lo.s32 %r3, %r37, %r38, %r1;
- shl.b32 %r39, %r3, 2;
- mov.u32 %r40, _ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E8red_buf0;
- add.s32 %r4, %r40, %r39;
- st.shared.f32 [%r4], %f231;
- bar.sync 0;
- // Step 1: threads 0..511 add the slot 512 entries (2048 bytes) above their own.
- setp.gt.s32 %p22, %r3, 511;
- @%p22 bra BB0_22;
-
- ld.shared.f32 %f232, [%r4];
- ld.shared.f32 %f233, [%r4+2048];
- add.f32 %f234, %f232, %f233;
- st.shared.f32 [%r4], %f234;
-
- BB0_22:
- // Step 2: threads 0..255 add the slot 256 entries (1024 bytes) above.
- bar.sync 0;
- setp.gt.s32 %p23, %r3, 255;
- @%p23 bra BB0_24;
-
- ld.shared.f32 %f235, [%r4];
- ld.shared.f32 %f236, [%r4+1024];
- add.f32 %f237, %f235, %f236;
- st.shared.f32 [%r4], %f237;
-
- BB0_24:
- // Step 3: threads 0..127 add the slot 128 entries (512 bytes) above.
- bar.sync 0;
- setp.gt.s32 %p24, %r3, 127;
- @%p24 bra BB0_26;
-
- ld.shared.f32 %f238, [%r4];
- ld.shared.f32 %f239, [%r4+512];
- add.f32 %f240, %f238, %f239;
- st.shared.f32 [%r4], %f240;
-
- BB0_26:
- // Step 4: threads 0..63 add the slot 64 entries (256 bytes) above.
- bar.sync 0;
- setp.gt.s32 %p25, %r3, 63;
- @%p25 bra BB0_28;
-
- ld.shared.f32 %f241, [%r4];
- ld.shared.f32 %f242, [%r4+256];
- add.f32 %f243, %f241, %f242;
- st.shared.f32 [%r4], %f243;
-
- BB0_28:
- // Step 5: threads 0..31 add the slot 32 entries (128 bytes) above.
- bar.sync 0;
- setp.gt.s32 %p26, %r3, 31;
- @%p26 bra BB0_30;
-
- ld.shared.f32 %f244, [%r4];
- ld.shared.f32 %f245, [%r4+128];
- add.f32 %f246, %f244, %f245;
- st.shared.f32 [%r4], %f246;
-
- BB0_30:
- // Threads with %r3 >= 32 skip the shuffle stage and go to the next barrier.
- setp.lt.s32 %p1, %r3, 32;
- bar.sync 0;
- @!%p1 bra BB0_33;
- bra.uni BB0_31;
-
- BB0_31:
- // ---- Final 32 -> 1 reduction with shuffles ----
- // Threads with %r3 < 32 reload their slot and combine values with
- // shfl.sync.down at distances 16, 8, 4, 2, 1 (%r43 = 31 is the clamp
- // operand, %r45 = -1 is the member mask). The shuffle predicates
- // %p27..%p31 are not used. The total ends up in %f31 of the %r3 == 0 thread.
- ld.shared.f32 %f247, [%r4];
- mov.b32 %r41, %f247;
- mov.u32 %r42, 2;
- mov.u32 %r43, 31;
- mov.u32 %r44, 16;
- mov.u32 %r45, -1;
- shfl.sync.down.b32 %r46|%p27, %r41, %r44, %r43, %r45;
- mov.b32 %f248, %r46;
- add.f32 %f249, %f247, %f248;
- mov.b32 %r47, %f249;
- mov.u32 %r48, 8;
- shfl.sync.down.b32 %r49|%p28, %r47, %r48, %r43, %r45;
- mov.b32 %f250, %r49;
- add.f32 %f251, %f249, %f250;
- mov.b32 %r50, %f251;
- mov.u32 %r51, 4;
- shfl.sync.down.b32 %r52|%p29, %r50, %r51, %r43, %r45;
- mov.b32 %f252, %r52;
- add.f32 %f253, %f251, %f252;
- mov.b32 %r53, %f253;
- shfl.sync.down.b32 %r54|%p30, %r53, %r42, %r43, %r45;
- mov.b32 %f254, %r54;
- add.f32 %f255, %f253, %f254;
- mov.b32 %r55, %f255;
- mov.u32 %r56, 1;
- shfl.sync.down.b32 %r57|%p31, %r55, %r56, %r43, %r45;
- mov.b32 %f256, %r57;
- add.f32 %f31, %f255, %f256;
- // Only the thread with %r3 == 0 writes the block total to red_buf0[0].
- setp.ne.s32 %p32, %r3, 0;
- @%p32 bra BB0_33;
-
- st.shared.f32 [_ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E8red_buf0], %f31;
-
- BB0_33:
- // After the barrier, the %r3 == 0 thread adds red_buf0[0] into the shared
- // running total T_abs_red_shared (zeroed earlier by the tid.x == 0 thread).
- bar.sync 0;
- setp.ne.s32 %p33, %r3, 0;
- @%p33 bra BB0_35;
-
- ld.shared.f32 %f257, [_ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E16T_abs_red_shared];
- ld.shared.f32 %f258, [_ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E8red_buf0];
- add.f32 %f259, %f257, %f258;
- st.shared.f32 [_ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E16T_abs_red_shared], %f259;
-
- BB0_35:
- // Threads with tid.x != 0 skip straight to the final barrier.
- setp.eq.s32 %p2, %r1, 0;
- bar.sync 0;
- @!%p2 bra BB0_37;
- bra.uni BB0_36;
-
- BB0_36:
- // The tid.x == 0 thread atomically adds the block total to the f32 at the
- // param_2 address. The value returned by the atomic (%f261) is not used.
- ld.shared.f32 %f260, [_ZZ61Fused_Tanh_Sub_Abs_ReduceSum_split_862706807055658019_kernel0E16T_abs_red_shared];
- cvta.to.global.u64 %rd20, %rd10;
- atom.global.add.f32 %f261, [%rd20], %f260;
-
- BB0_37:
- // Final block-wide barrier before every thread returns.
- bar.sync 0;
- ret;
- }
-
|