|
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-26907403
- // Cuda compilation tools, release 10.1, V10.1.243
- // Based on LLVM 3.4svn
- //
-
- .version 6.4 // PTX ISA version this module is written against
- .target sm_60 // minimum target architecture the code is generated for
- .address_size 64 // addresses are 64-bit
-
- // .globl Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0
-
- // Kernel entry point (NVVM-generated). What the body below does, per f32 element index i:
- //   param_4[i] = (param_2[i] - 1)^2                    (float4 pass)
- //   param_3[i] = (param_1[i] - 1)^2                    (scalar pass 1)
- //   param_5[i] = param_0[i] * ((param_2[i] - 1) * C)   (scalar pass 2)
- //   param_6[i] = param_0[i] * ((param_1[i] - 1) * C)   (scalar pass 3)
- // with C = 0f3A91A2B3 (about 1.1111e-3, i.e. roughly 1/900). Every scalar access is
- // guarded by i <= 899. Each scalar pass is a main loop over i = ctaid*448 + tid + 224*k
- // followed by a tail loop that starts 896 elements further on; both are emitted as a
- // "trip count mod 4" prologue plus a 4x-unrolled body.
- // NOTE(review): the 448/224/896 constants suggest 224 threads per CTA and 2 CTAs over
- // 900 floats -- the launch configuration is not visible here, confirm on the host side.
- .visible .entry Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0(
- .param .u64 Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_0, // pointer, only read (ld.global.nc)
- .param .u64 Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_1, // pointer, only read (ld.global.nc)
- .param .u64 Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_2, // pointer, only read (ld.global.nc)
- .param .u64 Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_3, // pointer, only written
- .param .u64 Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_4, // pointer, only written
- .param .u64 Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_5, // pointer, only written
- .param .u64 Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_6 // pointer, only written
- )
- {
- .reg .pred %p<81>;
- .reg .f32 %f<215>;
- .reg .b32 %r<226>;
- .reg .b64 %rd<94>;
-
- // Load the seven pointer parameters and convert them to global-space addresses.
- ld.param.u64 %rd30, [Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_0];
- ld.param.u64 %rd31, [Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_1];
- ld.param.u64 %rd32, [Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_2];
- ld.param.u64 %rd33, [Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_3];
- ld.param.u64 %rd34, [Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_4];
- ld.param.u64 %rd35, [Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_5];
- ld.param.u64 %rd36, [Fused_BroadcastTo_Sub_Mul_BroadcastTo_Sub_Mul_BroadcastTo_Reshape_BroadcastTo_Mu_more_split_7682864833902721405_kernel0_param_6];
- cvta.to.global.u64 %rd1, %rd33; // %rd1 = param_3 (output of scalar pass 1)
- cvta.to.global.u64 %rd2, %rd35; // %rd2 = param_5 (output of scalar pass 2)
- cvta.to.global.u64 %rd3, %rd36; // %rd3 = param_6 (output of scalar pass 3)
- cvta.to.global.u64 %rd4, %rd31; // %rd4 = param_1 (input)
- cvta.to.global.u64 %rd5, %rd30; // %rd5 = param_0 (input)
- cvta.to.global.u64 %rd6, %rd34; // %rd6 = param_4 (output of float4 pass)
- cvta.to.global.u64 %rd7, %rd32; // %rd7 = param_2 (input)
- mov.u32 %r1, %tid.x; // %r1 = thread index, live for the whole kernel
- mov.u32 %r2, %ctaid.x; // %r2 = CTA index, live for the whole kernel
- setp.gt.s32 %p1, %r1, 111; // only threads 0..111 take part in the float4 pass
- @%p1 bra BB0_3;
- // float4 pass: param_4[j..j+3] = (param_2[j..j+3] - 1)^2 with j = ctaid*448 + tid*4.
- // NOTE(review): unlike the scalar passes there is no "<= 899" guard on this path, so it
- // relies on the grid being small enough -- confirm against the launch configuration.
- shl.b32 %r97, %r1, 2;
- mad.lo.s32 %r98, %r2, 448, %r97;
- mul.wide.s32 %rd37, %r98, 4; // byte offset = j * sizeof(f32)
- add.s64 %rd38, %rd7, %rd37;
- ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd38];
- add.f32 %f9, %f1, 0fBF800000; // 0fBF800000 is -1.0f, i.e. x - 1
- add.f32 %f10, %f2, 0fBF800000;
- add.f32 %f11, %f3, 0fBF800000;
- add.f32 %f12, %f4, 0fBF800000;
- add.s64 %rd39, %rd6, %rd37;
- mul.f32 %f13, %f12, %f12;
- mul.f32 %f14, %f11, %f11;
- mul.f32 %f15, %f10, %f10;
- mul.f32 %f16, %f9, %f9;
- st.global.v4.f32 [%rd39], {%f16, %f15, %f14, %f13};
- or.b32 %r99, %r2, %r1; // zero only when ctaid == 0 and tid == 0
- setp.ne.s32 %p2, %r99, 0;
- @%p2 bra BB0_3;
- // Thread 0 of CTA 0 additionally handles the four floats at byte offset 3584 (elements 896..899).
- ld.global.nc.v4.f32 {%f17, %f18, %f19, %f20}, [%rd7+3584];
- add.f32 %f25, %f17, 0fBF800000;
- add.f32 %f26, %f18, 0fBF800000;
- add.f32 %f27, %f19, 0fBF800000;
- add.f32 %f28, %f20, 0fBF800000;
- mul.f32 %f29, %f28, %f28;
- mul.f32 %f30, %f27, %f27;
- mul.f32 %f31, %f26, %f26;
- mul.f32 %f32, %f25, %f25;
- st.global.v4.f32 [%rd6+3584], {%f32, %f31, %f30, %f29};
- // Scalar pass 1, main loop: param_3[idx] = (param_1[idx] - 1)^2.
- BB0_3:
- shl.b32 %r3, %r2, 1; // %r3 = 2 * ctaid
- mov.u32 %r101, 1; // constant 1, reused later as a min() operand and as a loop index
- mov.u32 %r102, 4;
- sub.s32 %r103, %r102, %r3;
- min.s32 %r4, %r101, %r103; // %r4 = min(1, 4 - 2*ctaid) = last k of the main loops
- mul.lo.s32 %r5, %r2, 448; // %r5 = ctaid * 448, element base of this CTA
- add.s32 %r7, %r1, %r5; // %r7 = element index for k = 0
- mul.wide.s32 %rd40, %r7, 4; // byte offset for k = 0, shared by all three scalar passes
- add.s64 %rd8, %rd4, %rd40; // &param_1[%r7]
- add.s64 %rd9, %rd1, %rd40; // &param_3[%r7]
- setp.lt.s32 %p3, %r4, 0; // %p3 = main loops are empty; reused by passes 2 and 3
- @%p3 bra BB0_26;
- // Remainder prologue: dispatch on (trip count & 3), where trip count = %r4 + 1.
- add.s32 %r8, %r4, 1;
- and.b32 %r107, %r8, 3;
- mov.u32 %r196, 0; // %r196 = k
- setp.eq.s32 %p4, %r107, 0;
- @%p4 bra BB0_15;
-
- setp.eq.s32 %p5, %r107, 1;
- @%p5 bra BB0_12;
-
- setp.eq.s32 %p6, %r107, 2;
- @%p6 bra BB0_9;
- // (trip & 3) == 3 arm. NOTE(review): %r4 <= 1 caps the trip count at 2, so this arm looks unreachable.
- mov.u32 %r196, 1;
- setp.gt.s32 %p7, %r7, 899;
- @%p7 bra BB0_9;
-
- ld.global.nc.f32 %f33, [%rd8];
- add.f32 %f34, %f33, 0fBF800000;
- mul.f32 %f35, %f34, %f34;
- st.global.f32 [%rd9], %f35;
- // Iteration for k = %r196 (0 or 1 here); (-k) & 224 equals k*224 for those two values.
- BB0_9:
- neg.s32 %r110, %r196;
- and.b32 %r111, %r110, 224;
- add.s32 %r112, %r1, %r111;
- add.s32 %r10, %r112, %r5;
- setp.gt.s32 %p8, %r10, 899; // skip elements past index 899
- @%p8 bra BB0_11;
-
- mul.wide.s32 %rd41, %r10, 4;
- add.s64 %rd42, %rd4, %rd41;
- ld.global.nc.f32 %f36, [%rd42];
- add.f32 %f37, %f36, 0fBF800000;
- mul.f32 %f38, %f37, %f37;
- add.s64 %rd43, %rd1, %rd41;
- st.global.f32 [%rd43], %f38;
-
- BB0_11:
- add.s32 %r196, %r196, 1;
- // Last prologue iteration: idx = k*224 + tid + ctaid*448.
- BB0_12:
- mad.lo.s32 %r113, %r196, 224, %r1;
- add.s32 %r13, %r113, %r5;
- setp.gt.s32 %p9, %r13, 899;
- @%p9 bra BB0_14;
-
- mul.wide.s32 %rd44, %r13, 4;
- add.s64 %rd45, %rd4, %rd44;
- ld.global.nc.f32 %f39, [%rd45];
- add.f32 %f40, %f39, 0fBF800000;
- mul.f32 %f41, %f40, %f40;
- add.s64 %rd46, %rd1, %rd44;
- st.global.f32 [%rd46], %f41;
-
- BB0_14:
- add.s32 %r196, %r196, 1;
- // The 4x-unrolled body runs only when the trip count is at least 4.
- BB0_15:
- setp.lt.u32 %p10, %r8, 4;
- @%p10 bra BB0_26;
- // NOTE(review): with the trip count capped at 2 the unrolled body below looks unreachable.
- add.s32 %r200, %r196, -1;
- mad.lo.s32 %r114, %r2, 448, %r1;
- mad.lo.s32 %r199, %r196, 224, %r114;
- // Unrolled body: guarded elements idx, idx+224, idx+448, idx+672 (byte offsets 0/896/1792/2688).
- BB0_17:
- mul.wide.s32 %rd47, %r199, 4;
- add.s64 %rd10, %rd4, %rd47;
- add.s64 %rd11, %rd1, %rd47;
- setp.gt.s32 %p11, %r199, 899;
- @%p11 bra BB0_19;
-
- ld.global.nc.f32 %f42, [%rd10];
- add.f32 %f43, %f42, 0fBF800000;
- mul.f32 %f44, %f43, %f43;
- st.global.f32 [%rd11], %f44;
-
- BB0_19:
- add.s32 %r115, %r199, 224;
- setp.gt.s32 %p12, %r115, 899;
- @%p12 bra BB0_21;
-
- ld.global.nc.f32 %f45, [%rd10+896];
- add.f32 %f46, %f45, 0fBF800000;
- mul.f32 %f47, %f46, %f46;
- st.global.f32 [%rd11+896], %f47;
-
- BB0_21:
- add.s32 %r116, %r199, 448;
- setp.gt.s32 %p13, %r116, 899;
- @%p13 bra BB0_23;
-
- ld.global.nc.f32 %f48, [%rd10+1792];
- add.f32 %f49, %f48, 0fBF800000;
- mul.f32 %f50, %f49, %f49;
- st.global.f32 [%rd11+1792], %f50;
-
- BB0_23:
- add.s32 %r117, %r199, 672;
- setp.gt.s32 %p14, %r117, 899;
- @%p14 bra BB0_25;
-
- ld.global.nc.f32 %f51, [%rd10+2688];
- add.f32 %f52, %f51, 0fBF800000;
- mul.f32 %f53, %f52, %f52;
- st.global.f32 [%rd11+2688], %f53;
-
- BB0_25:
- add.s32 %r200, %r200, 4;
- add.s32 %r199, %r199, 896; // advance by 4 * 224 elements
- setp.lt.s32 %p15, %r200, %r4;
- @%p15 bra BB0_17;
- // Scalar pass 1, tail loop: same computation with idx = ctaid*448 + tid + 896 + 224*k.
- BB0_26:
- neg.s32 %r118, %r3;
- min.s32 %r22, %r101, %r118; // %r22 = min(1, -2*ctaid) = last k of the tail loops
- setp.lt.s32 %p16, %r22, 0; // %p16 = tail loops are empty; reused by passes 2 and 3
- @%p16 bra BB0_49;
-
- add.s32 %r23, %r1, 896; // tid + 896
- add.s32 %r24, %r22, 1; // tail trip count
- and.b32 %r123, %r24, 3;
- mov.u32 %r201, 0; // %r201 = k
- setp.eq.s32 %p17, %r123, 0;
- @%p17 bra BB0_38;
-
- setp.eq.s32 %p18, %r123, 1;
- @%p18 bra BB0_35;
-
- setp.eq.s32 %p19, %r123, 2;
- @%p19 bra BB0_32;
- // (trip & 3) == 3 arm; element for k = 0 sits 3584 bytes (896 floats) past %rd8/%rd9.
- add.s32 %r125, %r23, %r5;
- mov.u32 %r201, 1;
- setp.gt.s32 %p20, %r125, 899;
- @%p20 bra BB0_32;
-
- ld.global.nc.f32 %f54, [%rd8+3584];
- add.f32 %f55, %f54, 0fBF800000;
- mul.f32 %f56, %f55, %f55;
- st.global.f32 [%rd9+3584], %f56;
-
- BB0_32:
- neg.s32 %r127, %r201;
- and.b32 %r128, %r127, 224;
- add.s32 %r129, %r23, %r128;
- add.s32 %r26, %r129, %r5;
- setp.gt.s32 %p21, %r26, 899;
- @%p21 bra BB0_34;
-
- mul.wide.s32 %rd48, %r26, 4;
- add.s64 %rd49, %rd4, %rd48;
- ld.global.nc.f32 %f57, [%rd49];
- add.f32 %f58, %f57, 0fBF800000;
- mul.f32 %f59, %f58, %f58;
- add.s64 %rd50, %rd1, %rd48;
- st.global.f32 [%rd50], %f59;
-
- BB0_34:
- add.s32 %r201, %r201, 1;
-
- BB0_35:
- mad.lo.s32 %r130, %r201, 224, %r23;
- add.s32 %r29, %r130, %r5;
- setp.gt.s32 %p22, %r29, 899;
- @%p22 bra BB0_37;
-
- mul.wide.s32 %rd51, %r29, 4;
- add.s64 %rd52, %rd4, %rd51;
- ld.global.nc.f32 %f60, [%rd52];
- add.f32 %f61, %f60, 0fBF800000;
- mul.f32 %f62, %f61, %f61;
- add.s64 %rd53, %rd1, %rd51;
- st.global.f32 [%rd53], %f62;
-
- BB0_37:
- add.s32 %r201, %r201, 1;
-
- BB0_38:
- setp.lt.u32 %p23, %r24, 4;
- @%p23 bra BB0_49;
-
- add.s32 %r205, %r201, -1;
- mad.lo.s32 %r131, %r2, 448, %r1;
- mad.lo.s32 %r204, %r201, 224, %r131;
- // Unrolled tail body: the +896 offset is folded into the index; %r204 is advanced via %r132.
- BB0_40:
- add.s32 %r132, %r204, 896;
- mul.wide.s32 %rd54, %r132, 4;
- add.s64 %rd12, %rd4, %rd54;
- add.s64 %rd13, %rd1, %rd54;
- setp.gt.s32 %p24, %r132, 899;
- @%p24 bra BB0_42;
-
- ld.global.nc.f32 %f63, [%rd12];
- add.f32 %f64, %f63, 0fBF800000;
- mul.f32 %f65, %f64, %f64;
- st.global.f32 [%rd13], %f65;
-
- BB0_42:
- add.s32 %r133, %r204, 1120;
- setp.gt.s32 %p25, %r133, 899;
- @%p25 bra BB0_44;
-
- ld.global.nc.f32 %f66, [%rd12+896];
- add.f32 %f67, %f66, 0fBF800000;
- mul.f32 %f68, %f67, %f67;
- st.global.f32 [%rd13+896], %f68;
-
- BB0_44:
- add.s32 %r134, %r204, 1344;
- setp.gt.s32 %p26, %r134, 899;
- @%p26 bra BB0_46;
-
- ld.global.nc.f32 %f69, [%rd12+1792];
- add.f32 %f70, %f69, 0fBF800000;
- mul.f32 %f71, %f70, %f70;
- st.global.f32 [%rd13+1792], %f71;
-
- BB0_46:
- add.s32 %r135, %r204, 1568;
- setp.gt.s32 %p27, %r135, 899;
- @%p27 bra BB0_48;
-
- ld.global.nc.f32 %f72, [%rd12+2688];
- add.f32 %f73, %f72, 0fBF800000;
- mul.f32 %f74, %f73, %f73;
- st.global.f32 [%rd13+2688], %f74;
-
- BB0_48:
- add.s32 %r205, %r205, 4;
- setp.lt.s32 %p28, %r205, %r22;
- mov.u32 %r204, %r132; // next base = current base + 896
- @%p28 bra BB0_40;
- // Scalar pass 2, main loop: param_5[idx] = param_0[idx] * ((param_2[idx] - 1) * C).
- BB0_49:
- add.s64 %rd14, %rd5, %rd40; // &param_0[%r7]
- add.s64 %rd15, %rd7, %rd40; // &param_2[%r7]
- add.s64 %rd16, %rd2, %rd40; // &param_5[%r7]
- @%p3 bra BB0_72;
-
- add.s32 %r39, %r4, 1; // trip count, same value as in pass 1
- and.b32 %r139, %r39, 3;
- mov.u32 %r206, 0; // %r206 = k
- setp.eq.s32 %p30, %r139, 0;
- @%p30 bra BB0_61;
-
- setp.eq.s32 %p31, %r139, 1;
- @%p31 bra BB0_58;
-
- setp.eq.s32 %p32, %r139, 2;
- @%p32 bra BB0_55;
-
- mov.u32 %r206, 1;
- setp.gt.s32 %p33, %r7, 899;
- @%p33 bra BB0_55;
-
- ld.global.nc.f32 %f75, [%rd14];
- ld.global.nc.f32 %f76, [%rd15];
- add.f32 %f77, %f76, 0fBF800000;
- mul.f32 %f78, %f77, 0f3A91A2B3; // scale by C (about 1.1111e-3)
- mul.f32 %f79, %f75, %f78;
- st.global.f32 [%rd16], %f79;
-
- BB0_55:
- neg.s32 %r142, %r206;
- and.b32 %r143, %r142, 224;
- add.s32 %r144, %r1, %r143;
- add.s32 %r41, %r144, %r5;
- setp.gt.s32 %p34, %r41, 899;
- @%p34 bra BB0_57;
-
- mul.wide.s32 %rd57, %r41, 4;
- add.s64 %rd58, %rd5, %rd57;
- add.s64 %rd59, %rd7, %rd57;
- ld.global.nc.f32 %f80, [%rd59];
- add.f32 %f81, %f80, 0fBF800000;
- mul.f32 %f82, %f81, 0f3A91A2B3;
- ld.global.nc.f32 %f83, [%rd58];
- mul.f32 %f84, %f83, %f82;
- add.s64 %rd60, %rd2, %rd57;
- st.global.f32 [%rd60], %f84;
-
- BB0_57:
- add.s32 %r206, %r206, 1;
-
- BB0_58:
- mad.lo.s32 %r145, %r206, 224, %r1;
- add.s32 %r44, %r145, %r5;
- setp.gt.s32 %p35, %r44, 899;
- @%p35 bra BB0_60;
-
- mul.wide.s32 %rd61, %r44, 4;
- add.s64 %rd62, %rd5, %rd61;
- add.s64 %rd63, %rd7, %rd61;
- ld.global.nc.f32 %f85, [%rd63];
- add.f32 %f86, %f85, 0fBF800000;
- mul.f32 %f87, %f86, 0f3A91A2B3;
- ld.global.nc.f32 %f88, [%rd62];
- mul.f32 %f89, %f88, %f87;
- add.s64 %rd64, %rd2, %rd61;
- st.global.f32 [%rd64], %f89;
-
- BB0_60:
- add.s32 %r206, %r206, 1;
-
- BB0_61:
- setp.lt.u32 %p36, %r39, 4;
- @%p36 bra BB0_72;
-
- add.s32 %r210, %r206, -1;
- mad.lo.s32 %r146, %r2, 448, %r1;
- mad.lo.s32 %r209, %r206, 224, %r146;
- // Unrolled body of pass 2 (four guarded elements, 224 apart).
- BB0_63:
- mul.wide.s32 %rd65, %r209, 4;
- add.s64 %rd17, %rd5, %rd65;
- add.s64 %rd18, %rd7, %rd65;
- add.s64 %rd19, %rd2, %rd65;
- setp.gt.s32 %p37, %r209, 899;
- @%p37 bra BB0_65;
-
- ld.global.nc.f32 %f90, [%rd17];
- ld.global.nc.f32 %f91, [%rd18];
- add.f32 %f92, %f91, 0fBF800000;
- mul.f32 %f93, %f92, 0f3A91A2B3;
- mul.f32 %f94, %f90, %f93;
- st.global.f32 [%rd19], %f94;
-
- BB0_65:
- add.s32 %r147, %r209, 224;
- setp.gt.s32 %p38, %r147, 899;
- @%p38 bra BB0_67;
-
- ld.global.nc.f32 %f95, [%rd18+896];
- add.f32 %f96, %f95, 0fBF800000;
- mul.f32 %f97, %f96, 0f3A91A2B3;
- ld.global.nc.f32 %f98, [%rd17+896];
- mul.f32 %f99, %f98, %f97;
- st.global.f32 [%rd19+896], %f99;
-
- BB0_67:
- add.s32 %r148, %r209, 448;
- setp.gt.s32 %p39, %r148, 899;
- @%p39 bra BB0_69;
-
- ld.global.nc.f32 %f100, [%rd18+1792];
- add.f32 %f101, %f100, 0fBF800000;
- mul.f32 %f102, %f101, 0f3A91A2B3;
- ld.global.nc.f32 %f103, [%rd17+1792];
- mul.f32 %f104, %f103, %f102;
- st.global.f32 [%rd19+1792], %f104;
-
- BB0_69:
- add.s32 %r149, %r209, 672;
- setp.gt.s32 %p40, %r149, 899;
- @%p40 bra BB0_71;
-
- ld.global.nc.f32 %f105, [%rd18+2688];
- add.f32 %f106, %f105, 0fBF800000;
- mul.f32 %f107, %f106, 0f3A91A2B3;
- ld.global.nc.f32 %f108, [%rd17+2688];
- mul.f32 %f109, %f108, %f107;
- st.global.f32 [%rd19+2688], %f109;
-
- BB0_71:
- add.s32 %r210, %r210, 4;
- add.s32 %r209, %r209, 896;
- setp.lt.s32 %p41, %r210, %r4;
- @%p41 bra BB0_63;
- // Scalar pass 2, tail loop (indices offset by 896); skipped when %p16 is set.
- BB0_72:
- @%p16 bra BB0_95;
-
- add.s32 %r53, %r1, 896;
- add.s32 %r54, %r22, 1;
- and.b32 %r153, %r54, 3;
- mov.u32 %r211, 0;
- setp.eq.s32 %p43, %r153, 0;
- @%p43 bra BB0_84;
-
- setp.eq.s32 %p44, %r153, 1;
- @%p44 bra BB0_81;
-
- setp.eq.s32 %p45, %r153, 2;
- @%p45 bra BB0_78;
-
- add.s32 %r155, %r53, %r5;
- mov.u32 %r211, 1;
- setp.gt.s32 %p46, %r155, 899;
- @%p46 bra BB0_78;
-
- ld.global.nc.f32 %f110, [%rd15+3584];
- add.f32 %f111, %f110, 0fBF800000;
- mul.f32 %f112, %f111, 0f3A91A2B3;
- ld.global.nc.f32 %f113, [%rd14+3584];
- mul.f32 %f114, %f113, %f112;
- st.global.f32 [%rd16+3584], %f114;
-
- BB0_78:
- neg.s32 %r157, %r211;
- and.b32 %r158, %r157, 224;
- add.s32 %r159, %r53, %r158;
- add.s32 %r56, %r159, %r5;
- setp.gt.s32 %p47, %r56, 899;
- @%p47 bra BB0_80;
-
- mul.wide.s32 %rd66, %r56, 4;
- add.s64 %rd67, %rd5, %rd66;
- add.s64 %rd68, %rd7, %rd66;
- ld.global.nc.f32 %f115, [%rd68];
- add.f32 %f116, %f115, 0fBF800000;
- mul.f32 %f117, %f116, 0f3A91A2B3;
- ld.global.nc.f32 %f118, [%rd67];
- mul.f32 %f119, %f118, %f117;
- add.s64 %rd69, %rd2, %rd66;
- st.global.f32 [%rd69], %f119;
-
- BB0_80:
- add.s32 %r211, %r211, 1;
-
- BB0_81:
- mad.lo.s32 %r160, %r211, 224, %r53;
- add.s32 %r59, %r160, %r5;
- setp.gt.s32 %p48, %r59, 899;
- @%p48 bra BB0_83;
-
- mul.wide.s32 %rd70, %r59, 4;
- add.s64 %rd71, %rd5, %rd70;
- add.s64 %rd72, %rd7, %rd70;
- ld.global.nc.f32 %f120, [%rd72];
- add.f32 %f121, %f120, 0fBF800000;
- mul.f32 %f122, %f121, 0f3A91A2B3;
- ld.global.nc.f32 %f123, [%rd71];
- mul.f32 %f124, %f123, %f122;
- add.s64 %rd73, %rd2, %rd70;
- st.global.f32 [%rd73], %f124;
-
- BB0_83:
- add.s32 %r211, %r211, 1;
-
- BB0_84:
- setp.lt.u32 %p49, %r54, 4;
- @%p49 bra BB0_95;
-
- add.s32 %r215, %r211, -1;
- mad.lo.s32 %r161, %r2, 448, %r1;
- mad.lo.s32 %r214, %r211, 224, %r161;
- // Unrolled tail body of pass 2.
- BB0_86:
- add.s32 %r162, %r214, 896;
- mul.wide.s32 %rd74, %r162, 4;
- add.s64 %rd20, %rd5, %rd74;
- add.s64 %rd21, %rd7, %rd74;
- add.s64 %rd22, %rd2, %rd74;
- setp.gt.s32 %p50, %r162, 899;
- @%p50 bra BB0_88;
-
- ld.global.nc.f32 %f125, [%rd20];
- ld.global.nc.f32 %f126, [%rd21];
- add.f32 %f127, %f126, 0fBF800000;
- mul.f32 %f128, %f127, 0f3A91A2B3;
- mul.f32 %f129, %f125, %f128;
- st.global.f32 [%rd22], %f129;
-
- BB0_88:
- add.s32 %r163, %r214, 1120;
- setp.gt.s32 %p51, %r163, 899;
- @%p51 bra BB0_90;
-
- ld.global.nc.f32 %f130, [%rd21+896];
- add.f32 %f131, %f130, 0fBF800000;
- mul.f32 %f132, %f131, 0f3A91A2B3;
- ld.global.nc.f32 %f133, [%rd20+896];
- mul.f32 %f134, %f133, %f132;
- st.global.f32 [%rd22+896], %f134;
-
- BB0_90:
- add.s32 %r164, %r214, 1344;
- setp.gt.s32 %p52, %r164, 899;
- @%p52 bra BB0_92;
-
- ld.global.nc.f32 %f135, [%rd21+1792];
- add.f32 %f136, %f135, 0fBF800000;
- mul.f32 %f137, %f136, 0f3A91A2B3;
- ld.global.nc.f32 %f138, [%rd20+1792];
- mul.f32 %f139, %f138, %f137;
- st.global.f32 [%rd22+1792], %f139;
-
- BB0_92:
- add.s32 %r165, %r214, 1568;
- setp.gt.s32 %p53, %r165, 899;
- @%p53 bra BB0_94;
-
- ld.global.nc.f32 %f140, [%rd21+2688];
- add.f32 %f141, %f140, 0fBF800000;
- mul.f32 %f142, %f141, 0f3A91A2B3;
- ld.global.nc.f32 %f143, [%rd20+2688];
- mul.f32 %f144, %f143, %f142;
- st.global.f32 [%rd22+2688], %f144;
-
- BB0_94:
- add.s32 %r215, %r215, 4;
- setp.lt.s32 %p54, %r215, %r22;
- mov.u32 %r214, %r162;
- @%p54 bra BB0_86;
- // Scalar pass 3, main loop: param_6[idx] = param_0[idx] * ((param_1[idx] - 1) * C).
- BB0_95:
- add.s64 %rd23, %rd3, %rd40; // &param_6[%r7]
- @%p3 bra BB0_118;
-
- add.s32 %r68, %r4, 1;
- and.b32 %r169, %r68, 3;
- mov.u32 %r216, 0; // %r216 = k
- setp.eq.s32 %p56, %r169, 0;
- @%p56 bra BB0_107;
-
- setp.eq.s32 %p57, %r169, 1;
- @%p57 bra BB0_104;
-
- setp.eq.s32 %p58, %r169, 2;
- @%p58 bra BB0_101;
-
- mov.u32 %r216, 1;
- setp.gt.s32 %p59, %r7, 899;
- @%p59 bra BB0_101;
-
- ld.global.nc.f32 %f145, [%rd14];
- ld.global.nc.f32 %f146, [%rd8];
- add.f32 %f147, %f146, 0fBF800000;
- mul.f32 %f148, %f147, 0f3A91A2B3;
- mul.f32 %f149, %f145, %f148;
- st.global.f32 [%rd23], %f149;
-
- BB0_101:
- neg.s32 %r172, %r216;
- and.b32 %r173, %r172, 224;
- add.s32 %r174, %r1, %r173;
- add.s32 %r70, %r174, %r5;
- setp.gt.s32 %p60, %r70, 899;
- @%p60 bra BB0_103;
-
- mul.wide.s32 %rd76, %r70, 4;
- add.s64 %rd77, %rd5, %rd76;
- add.s64 %rd78, %rd4, %rd76;
- ld.global.nc.f32 %f150, [%rd78];
- add.f32 %f151, %f150, 0fBF800000;
- mul.f32 %f152, %f151, 0f3A91A2B3;
- ld.global.nc.f32 %f153, [%rd77];
- mul.f32 %f154, %f153, %f152;
- add.s64 %rd79, %rd3, %rd76;
- st.global.f32 [%rd79], %f154;
-
- BB0_103:
- add.s32 %r216, %r216, 1;
-
- BB0_104:
- mad.lo.s32 %r175, %r216, 224, %r1;
- add.s32 %r73, %r175, %r5;
- setp.gt.s32 %p61, %r73, 899;
- @%p61 bra BB0_106;
-
- mul.wide.s32 %rd80, %r73, 4;
- add.s64 %rd81, %rd5, %rd80;
- add.s64 %rd82, %rd4, %rd80;
- ld.global.nc.f32 %f155, [%rd82];
- add.f32 %f156, %f155, 0fBF800000;
- mul.f32 %f157, %f156, 0f3A91A2B3;
- ld.global.nc.f32 %f158, [%rd81];
- mul.f32 %f159, %f158, %f157;
- add.s64 %rd83, %rd3, %rd80;
- st.global.f32 [%rd83], %f159;
-
- BB0_106:
- add.s32 %r216, %r216, 1;
-
- BB0_107:
- setp.lt.u32 %p62, %r68, 4;
- @%p62 bra BB0_118;
-
- add.s32 %r220, %r216, -1;
- mad.lo.s32 %r176, %r2, 448, %r1;
- mad.lo.s32 %r219, %r216, 224, %r176;
- // Unrolled body of pass 3 (four guarded elements, 224 apart).
- BB0_109:
- mul.wide.s32 %rd84, %r219, 4;
- add.s64 %rd24, %rd5, %rd84;
- add.s64 %rd25, %rd4, %rd84;
- add.s64 %rd26, %rd3, %rd84;
- setp.gt.s32 %p63, %r219, 899;
- @%p63 bra BB0_111;
-
- ld.global.nc.f32 %f160, [%rd24];
- ld.global.nc.f32 %f161, [%rd25];
- add.f32 %f162, %f161, 0fBF800000;
- mul.f32 %f163, %f162, 0f3A91A2B3;
- mul.f32 %f164, %f160, %f163;
- st.global.f32 [%rd26], %f164;
-
- BB0_111:
- add.s32 %r177, %r219, 224;
- setp.gt.s32 %p64, %r177, 899;
- @%p64 bra BB0_113;
-
- ld.global.nc.f32 %f165, [%rd25+896];
- add.f32 %f166, %f165, 0fBF800000;
- mul.f32 %f167, %f166, 0f3A91A2B3;
- ld.global.nc.f32 %f168, [%rd24+896];
- mul.f32 %f169, %f168, %f167;
- st.global.f32 [%rd26+896], %f169;
-
- BB0_113:
- add.s32 %r178, %r219, 448;
- setp.gt.s32 %p65, %r178, 899;
- @%p65 bra BB0_115;
-
- ld.global.nc.f32 %f170, [%rd25+1792];
- add.f32 %f171, %f170, 0fBF800000;
- mul.f32 %f172, %f171, 0f3A91A2B3;
- ld.global.nc.f32 %f173, [%rd24+1792];
- mul.f32 %f174, %f173, %f172;
- st.global.f32 [%rd26+1792], %f174;
-
- BB0_115:
- add.s32 %r179, %r219, 672;
- setp.gt.s32 %p66, %r179, 899;
- @%p66 bra BB0_117;
-
- ld.global.nc.f32 %f175, [%rd25+2688];
- add.f32 %f176, %f175, 0fBF800000;
- mul.f32 %f177, %f176, 0f3A91A2B3;
- ld.global.nc.f32 %f178, [%rd24+2688];
- mul.f32 %f179, %f178, %f177;
- st.global.f32 [%rd26+2688], %f179;
-
- BB0_117:
- add.s32 %r220, %r220, 4;
- add.s32 %r219, %r219, 896;
- setp.lt.s32 %p67, %r220, %r4;
- @%p67 bra BB0_109;
- // Scalar pass 3, tail loop (indices offset by 896); skipped when %p16 is set.
- BB0_118:
- @%p16 bra BB0_142;
-
- add.s32 %r82, %r1, 896;
- add.s32 %r83, %r22, 1;
- and.b32 %r183, %r83, 3;
- mov.u32 %r221, 0;
- setp.eq.s32 %p69, %r183, 0;
- @%p69 bra BB0_131;
-
- setp.eq.s32 %p70, %r183, 1;
- @%p70 bra BB0_128;
-
- setp.eq.s32 %p71, %r183, 2;
- @%p71 bra BB0_125;
-
- add.s32 %r185, %r82, %r5;
- setp.gt.s32 %p72, %r185, 899;
- @%p72 bra BB0_124;
-
- ld.global.nc.f32 %f180, [%rd8+3584];
- add.f32 %f181, %f180, 0fBF800000;
- mul.f32 %f182, %f181, 0f3A91A2B3;
- ld.global.nc.f32 %f183, [%rd14+3584];
- mul.f32 %f184, %f183, %f182;
- st.global.f32 [%rd23+3584], %f184;
-
- BB0_124:
- mov.u32 %r221, %r101; // k = 1 (%r101 holds the constant 1)
-
- BB0_125:
- neg.s32 %r187, %r221;
- and.b32 %r188, %r187, 224;
- add.s32 %r189, %r82, %r188;
- add.s32 %r85, %r189, %r5;
- setp.gt.s32 %p73, %r85, 899;
- @%p73 bra BB0_127;
-
- mul.wide.s32 %rd85, %r85, 4;
- add.s64 %rd86, %rd5, %rd85;
- add.s64 %rd87, %rd4, %rd85;
- ld.global.nc.f32 %f185, [%rd87];
- add.f32 %f186, %f185, 0fBF800000;
- mul.f32 %f187, %f186, 0f3A91A2B3;
- ld.global.nc.f32 %f188, [%rd86];
- mul.f32 %f189, %f188, %f187;
- add.s64 %rd88, %rd3, %rd85;
- st.global.f32 [%rd88], %f189;
-
- BB0_127:
- add.s32 %r221, %r221, 1;
-
- BB0_128:
- mad.lo.s32 %r190, %r221, 224, %r82;
- add.s32 %r88, %r190, %r5;
- setp.gt.s32 %p74, %r88, 899;
- @%p74 bra BB0_130;
-
- mul.wide.s32 %rd89, %r88, 4;
- add.s64 %rd90, %rd5, %rd89;
- add.s64 %rd91, %rd4, %rd89;
- ld.global.nc.f32 %f190, [%rd91];
- add.f32 %f191, %f190, 0fBF800000;
- mul.f32 %f192, %f191, 0f3A91A2B3;
- ld.global.nc.f32 %f193, [%rd90];
- mul.f32 %f194, %f193, %f192;
- add.s64 %rd92, %rd3, %rd89;
- st.global.f32 [%rd92], %f194;
-
- BB0_130:
- add.s32 %r221, %r221, 1;
-
- BB0_131:
- setp.lt.u32 %p75, %r83, 4;
- @%p75 bra BB0_142;
-
- add.s32 %r225, %r221, -1;
- mad.lo.s32 %r191, %r2, 448, %r1;
- mad.lo.s32 %r224, %r221, 224, %r191;
- // Unrolled tail body of pass 3.
- BB0_133:
- add.s32 %r192, %r224, 896;
- mul.wide.s32 %rd93, %r192, 4;
- add.s64 %rd27, %rd5, %rd93;
- add.s64 %rd28, %rd4, %rd93;
- add.s64 %rd29, %rd3, %rd93;
- setp.gt.s32 %p76, %r192, 899;
- @%p76 bra BB0_135;
-
- ld.global.nc.f32 %f195, [%rd27];
- ld.global.nc.f32 %f196, [%rd28];
- add.f32 %f197, %f196, 0fBF800000;
- mul.f32 %f198, %f197, 0f3A91A2B3;
- mul.f32 %f199, %f195, %f198;
- st.global.f32 [%rd29], %f199;
-
- BB0_135:
- add.s32 %r193, %r224, 1120;
- setp.gt.s32 %p77, %r193, 899;
- @%p77 bra BB0_137;
-
- ld.global.nc.f32 %f200, [%rd28+896];
- add.f32 %f201, %f200, 0fBF800000;
- mul.f32 %f202, %f201, 0f3A91A2B3;
- ld.global.nc.f32 %f203, [%rd27+896];
- mul.f32 %f204, %f203, %f202;
- st.global.f32 [%rd29+896], %f204;
-
- BB0_137:
- add.s32 %r194, %r224, 1344;
- setp.gt.s32 %p78, %r194, 899;
- @%p78 bra BB0_139;
-
- ld.global.nc.f32 %f205, [%rd28+1792];
- add.f32 %f206, %f205, 0fBF800000;
- mul.f32 %f207, %f206, 0f3A91A2B3;
- ld.global.nc.f32 %f208, [%rd27+1792];
- mul.f32 %f209, %f208, %f207;
- st.global.f32 [%rd29+1792], %f209;
-
- BB0_139:
- add.s32 %r195, %r224, 1568;
- setp.gt.s32 %p79, %r195, 899;
- @%p79 bra BB0_141;
-
- ld.global.nc.f32 %f210, [%rd28+2688];
- add.f32 %f211, %f210, 0fBF800000;
- mul.f32 %f212, %f211, 0f3A91A2B3;
- ld.global.nc.f32 %f213, [%rd27+2688];
- mul.f32 %f214, %f213, %f212;
- st.global.f32 [%rd29+2688], %f214;
-
- BB0_141:
- add.s32 %r225, %r225, 4;
- setp.lt.s32 %p80, %r225, %r22;
- mov.u32 %r224, %r192;
- @%p80 bra BB0_133;
-
- BB0_142:
- ret;
- }
-
|