Search in sources :

Example 31 with Label

use of org.graalvm.compiler.asm.Label in project graal by oracle.

The method powIntrinsic of the class AMD64MathIntrinsicBinaryOp.

/**
 * Emits the AMD64 instruction sequence of the {@code pow} intrinsic: the emitted code reads the
 * base from {@code value1} and the exponent from {@code value2} and leaves the result in
 * {@code dest}. Every exit path of the emitted code ends at the single label {@code bb56}.
 *
 * <p>
 * Register roles that hold for the whole sequence (established by the first instructions):
 * <ul>
 * <li>{@code temp10} keeps an untouched copy of {@code value1} (the base, "x").</li>
 * <li>{@code temp8} keeps an untouched copy of {@code value2} (the exponent, "y").</li>
 * <li>{@code temp9} keeps the quadword loaded from the {@code logTwoE} table.</li>
 * <li>{@code gpr5} is either {@code 0} or {@code Integer.MIN_VALUE}; it is merged into the high
 * bits of the final scaling factor and therefore acts as the sign of the result.</li>
 * <li>{@code gpr3} is allocated from {@code rcxTemp}; the one-operand {@code shll} calls
 * presumably shift by the count held in it (CL) - confirm against the assembler.</li>
 * </ul>
 *
 * <p>
 * {@code temp1} is a scratch register: the main path keeps the constant
 * {@code 0x77f0000000000000} in it, while the special-case blocks inspect it as a copy of y. Any
 * block that inspects it as y must therefore load it from {@code temp8} first.
 *
 * <p>
 * The hexadecimal words in comments next to table loads are the annotations carried over from
 * the original listing; they are not verified against the table contents here, and a few of
 * them disagree with each other for the same table offset.
 *
 * @param dest register receiving the result
 * @param value1 register holding the base
 * @param value2 register holding the exponent
 * @param crb compilation result builder, registered through {@code setCrb} so that the data
 *            tables can be addressed
 * @param masm assembler the instructions are emitted into
 */
public void powIntrinsic(Register dest, Register value1, Register value2, CompilationResultBuilder crb, AMD64MacroAssembler masm) {
    ArrayDataPointerConstant highSigMaskPtr = new ArrayDataPointerConstant(highSigMask, 16);
    ArrayDataPointerConstant logTwoEPtr = new ArrayDataPointerConstant(logTwoE, 16);
    ArrayDataPointerConstant highmaskYPtr = new ArrayDataPointerConstant(highmaskY, 16);
    ArrayDataPointerConstant tExpPtr = new ArrayDataPointerConstant(tExp, 16);
    ArrayDataPointerConstant eCoeffPtr = new ArrayDataPointerConstant(eCoeff, 16);
    ArrayDataPointerConstant coeffHPtr = new ArrayDataPointerConstant(coeffH, 16);
    ArrayDataPointerConstant highmaskLogXPtr = new ArrayDataPointerConstant(highmaskLogX, 16);
    ArrayDataPointerConstant halfmaskPtr = new ArrayDataPointerConstant(halfmask, 8);
    ArrayDataPointerConstant coeffPowPtr = new ArrayDataPointerConstant(coeffPow, 16);
    ArrayDataPointerConstant lTblPowPtr = new ArrayDataPointerConstant(lTblPow, 16);
    ArrayDataPointerConstant logTwoPowPtr = new ArrayDataPointerConstant(logTwoPow, 8);
    Label bb0 = new Label();
    Label bb1 = new Label();
    Label bb2 = new Label();
    Label bb3 = new Label();
    Label bb4 = new Label();
    Label bb5 = new Label();
    Label bb6 = new Label();
    Label bb7 = new Label();
    Label bb8 = new Label();
    Label bb9 = new Label();
    Label bb10 = new Label();
    Label bb11 = new Label();
    Label bb12 = new Label();
    Label bb13 = new Label();
    Label bb14 = new Label();
    Label bb15 = new Label();
    Label bb16 = new Label();
    Label bb18 = new Label();
    Label bb19 = new Label();
    Label bb20 = new Label();
    Label bb21 = new Label();
    Label bb22 = new Label();
    Label bb23 = new Label();
    Label bb24 = new Label();
    Label bb25 = new Label();
    Label bb26 = new Label();
    Label bb27 = new Label();
    Label bb28 = new Label();
    Label bb29 = new Label();
    Label bb30 = new Label();
    Label bb31 = new Label();
    Label bb32 = new Label();
    Label bb33 = new Label();
    Label bb34 = new Label();
    Label bb35 = new Label();
    Label bb36 = new Label();
    Label bb37 = new Label();
    Label bb38 = new Label();
    Label bb39 = new Label();
    Label bb40 = new Label();
    Label bb41 = new Label();
    Label bb42 = new Label();
    Label bb43 = new Label();
    Label bb44 = new Label();
    Label bb45 = new Label();
    Label bb46 = new Label();
    Label bb47 = new Label();
    Label bb48 = new Label();
    Label bb49 = new Label();
    Label bb50 = new Label();
    Label bb51 = new Label();
    Label bb53 = new Label();
    Label bb54 = new Label();
    Label bb55 = new Label();
    Label bb56 = new Label();
    Register gpr1 = asRegister(gpr1Temp, AMD64Kind.QWORD);
    Register gpr2 = asRegister(gpr2Temp, AMD64Kind.QWORD);
    Register gpr3 = asRegister(rcxTemp, AMD64Kind.QWORD);
    Register gpr4 = asRegister(gpr4Temp, AMD64Kind.QWORD);
    Register gpr5 = asRegister(gpr5Temp, AMD64Kind.QWORD);
    Register gpr6 = asRegister(gpr6Temp, AMD64Kind.QWORD);
    Register gpr7 = asRegister(gpr7Temp, AMD64Kind.QWORD);
    Register gpr8 = asRegister(gpr8Temp, AMD64Kind.QWORD);
    Register temp1 = asRegister(xmm1Temp, AMD64Kind.DOUBLE);
    Register temp2 = asRegister(xmm2Temp, AMD64Kind.DOUBLE);
    Register temp3 = asRegister(xmm3Temp, AMD64Kind.DOUBLE);
    Register temp4 = asRegister(xmm4Temp, AMD64Kind.DOUBLE);
    Register temp5 = asRegister(xmm5Temp, AMD64Kind.DOUBLE);
    Register temp6 = asRegister(xmm6Temp, AMD64Kind.DOUBLE);
    Register temp7 = asRegister(xmm7Temp, AMD64Kind.DOUBLE);
    Register temp8 = asRegister(xmm8Temp, AMD64Kind.DOUBLE);
    Register temp9 = asRegister(xmm9Temp, AMD64Kind.DOUBLE);
    Register temp10 = asRegister(xmm10Temp, AMD64Kind.DOUBLE);
    setCrb(crb);

    // Preserve both inputs: temp10 = x, temp8 = y. dest is used as a working register below.
    masm.movdqu(temp10, value1);
    masm.movsd(temp8, value2);
    if (dest.encoding != value1.encoding) {
        masm.movdqu(dest, value1);
    }
    // 0x00000000, 0x3ff72000
    masm.movq(temp9, externalAddress(logTwoEPtr));
    // gpr1 = top 16 bits of x (sign, exponent, 4 mantissa bits).
    masm.pextrw(gpr1, dest, 3);
    masm.xorpd(temp2, temp2);
    masm.movq(gpr2, 0x3ff0000000000000L);
    masm.movdq(temp2, gpr2);
    masm.movl(gpr5, 1069088768);
    masm.movdq(temp7, gpr5);
    masm.xorpd(temp1, temp1);
    masm.movq(gpr6, 0x77f0000000000000L);
    masm.movdq(temp1, gpr6);
    masm.movdqu(temp3, dest);
    masm.movl(gpr4, 32752);
    masm.andl(gpr4, gpr1);
    masm.subl(gpr4, 16368);
    // gpr3 = abs(gpr4) via the sign-mask add/xor idiom.
    masm.movl(gpr3, gpr4);
    masm.sarl(gpr4, 31);
    masm.addl(gpr3, gpr4);
    masm.xorl(gpr3, gpr4);
    masm.por(dest, temp2);
    // 0x00000000, 0xfffff800, 0x00000000, 0xfffff800
    masm.movdqu(temp6, externalAddress(highSigMaskPtr));
    masm.psrlq(dest, 27);
    masm.psrld(dest, 2);
    masm.addl(gpr3, 16);
    masm.bsrl(gpr3, gpr3);
    masm.rcpps(dest, dest);
    masm.psllq(temp3, 12);
    masm.movl(gpr7, 8192);
    masm.movdq(temp4, gpr7);
    masm.psrlq(temp3, 12);
    // Unsigned range check on the top word of x: taken when the exponent field is zero, when it
    // is all ones, or when the sign bit is set. Those cases are sorted out at bb0.
    masm.subl(gpr1, 16);
    masm.cmpl(gpr1, 32736);
    masm.jcc(ConditionFlag.AboveEqual, bb0);
    // Positive finite normal x: the result sign flag is clear.
    masm.movl(gpr5, 0);

    // bb1: main path; also re-entered from bb35/bb36 for negative x with integral y.
    masm.bind(bb1);
    masm.mulss(dest, temp7);
    masm.movl(gpr4, -1);
    masm.subl(gpr3, 4);
    masm.shll(gpr4);
    masm.shlq(gpr4, 32);
    masm.movdq(temp5, gpr4);
    masm.por(temp3, temp1);
    masm.subl(gpr1, 16351);
    masm.cmpl(gpr1, 1);
    masm.jcc(ConditionFlag.BelowEqual, bb2);
    masm.paddd(dest, temp4);
    masm.pand(temp5, temp3);
    masm.movdl(gpr4, dest);
    masm.psllq(dest, 29);

    masm.bind(bb3);
    masm.subsd(temp3, temp5);
    masm.pand(dest, temp6);
    masm.subl(gpr1, 1);
    masm.sarl(gpr1, 4);
    masm.cvtsi2sdl(temp7, gpr1);
    masm.mulpd(temp5, dest);

    // bb4: joined by bb8/bb10 after they have rescaled a denormal x.
    masm.bind(bb4);
    masm.mulsd(temp3, dest);
    masm.leaq(gpr8, externalAddress(coeffPowPtr));
    // 0x6dc96112, 0xbf836578, 0xee241472, 0xbf9b0301
    masm.movdqu(temp1, new AMD64Address(gpr8, 0));
    // 0x9f95985a, 0xbfb528db, 0xb3841d2a, 0xbfd619b6
    masm.movdqu(temp4, new AMD64Address(gpr8, 16));
    // 0x518775e3, 0x3f9004f2, 0xac8349bb, 0x3fa76c9b
    masm.movdqu(temp6, new AMD64Address(gpr8, 32));
    // 0x486ececc, 0x3fc4635e, 0x161bb241, 0xbf5dabe1
    masm.movdqu(dest, new AMD64Address(gpr8, 48));
    masm.subsd(temp5, temp9);
    masm.movl(gpr3, gpr1);
    masm.sarl(gpr1, 31);
    masm.addl(gpr3, gpr1);
    masm.xorl(gpr1, gpr3);
    masm.addl(gpr1, 1);
    masm.bsrl(gpr1, gpr1);
    masm.unpcklpd(temp5, temp3);
    masm.addsd(temp3, temp5);
    masm.leaq(gpr7, externalAddress(lTblPowPtr));
    masm.andl(gpr4, 16760832);
    masm.shrl(gpr4, 10);
    masm.addpd(temp5, new AMD64Address(gpr7, gpr4, Scale.Times1, -3648));
    masm.pshufd(temp2, temp3, 0x44);
    masm.mulsd(temp3, temp3);
    masm.mulpd(temp1, temp2);
    masm.mulpd(temp4, temp2);
    masm.addsd(temp5, temp7);
    masm.mulsd(temp2, temp3);
    masm.addpd(temp6, temp1);
    masm.mulsd(temp3, temp3);
    masm.addpd(dest, temp4);
    // From here on temp1 holds y on the main path.
    masm.movdqu(temp1, temp8);
    masm.pextrw(gpr3, temp8, 3);
    masm.pshufd(temp7, temp5, 0xEE);
    // 0x00000000, 0xfffffff8
    masm.movq(temp4, externalAddress(highmaskYPtr));
    masm.mulpd(temp6, temp2);
    masm.pshufd(temp3, temp3, 0x44);
    masm.mulpd(dest, temp2);
    masm.shll(gpr1, 4);
    masm.subl(gpr1, 15872);
    masm.andl(gpr3, 32752);
    masm.addl(gpr1, gpr3);
    masm.mulpd(temp3, temp6);
    // Magnitude check on the combined exponent estimate; out-of-range cases go to bb5.
    masm.cmpl(gpr1, 624);
    masm.jcc(ConditionFlag.AboveEqual, bb5);
    masm.xorpd(temp6, temp6);
    masm.movl(gpr4, 17080);
    masm.pinsrw(temp6, gpr4, 3);
    masm.movdqu(temp2, temp1);
    masm.pand(temp4, temp1);
    masm.subsd(temp1, temp4);
    masm.mulsd(temp4, temp5);
    masm.addsd(dest, temp7);
    masm.mulsd(temp1, temp5);
    masm.movdqu(temp7, temp6);
    masm.addsd(temp6, temp4);
    masm.leaq(gpr7, externalAddress(tExpPtr));
    masm.addpd(temp3, dest);
    masm.movdl(gpr4, temp6);
    masm.movl(gpr3, gpr4);
    masm.andl(gpr4, 255);
    masm.addl(gpr4, gpr4);
    masm.movdqu(temp5, new AMD64Address(gpr7, gpr4, Scale.Times8, 0));
    masm.subsd(temp6, temp7);
    masm.pshufd(dest, temp3, 0xEE);
    masm.subsd(temp4, temp6);
    masm.addsd(dest, temp3);
    masm.addsd(temp4, temp1);
    masm.mulsd(temp2, dest);
    masm.leaq(gpr8, externalAddress(eCoeffPtr));
    // 0xe78a6731, 0x3f55d87f, 0xd704a0c0, 0x3fac6b08
    masm.movdqu(temp7, new AMD64Address(gpr8, 0));
    // 0x6fba4e77, 0x3f83b2ab, 0xff82c58f, 0x3fcebfbd
    masm.movdqu(temp3, new AMD64Address(gpr8, 16));
    masm.shll(gpr3, 12);
    // Merge the result sign flag into the high bits that are added to the table value below.
    masm.xorl(gpr3, gpr5);
    masm.andl(gpr3, -1048576);
    masm.movdq(temp6, gpr3);
    masm.addsd(temp2, temp4);
    masm.movq(gpr2, 0x3fe62e42fefa39efL);
    masm.movdq(temp1, gpr2);
    masm.pshufd(dest, temp2, 0x44);
    masm.pshufd(temp4, temp2, 0x44);
    masm.mulsd(temp1, temp2);
    masm.pshufd(temp6, temp6, 0x11);
    masm.mulpd(dest, dest);
    masm.mulpd(temp7, temp4);
    masm.paddd(temp5, temp6);
    masm.mulsd(temp1, temp5);
    masm.pshufd(temp6, temp5, 0xEE);
    masm.mulsd(dest, dest);
    masm.addpd(temp3, temp7);
    masm.addsd(temp1, temp6);
    masm.mulpd(dest, temp3);
    masm.pshufd(temp3, dest, 0xEE);
    masm.mulsd(dest, temp5);
    masm.mulsd(temp3, temp5);
    masm.addsd(dest, temp1);
    masm.addsd(dest, temp3);
    masm.addsd(dest, temp5);
    masm.jmp(bb56);

    // bb0: x failed the range check. Dispatch on exponent-all-ones (bb6), sign bit set (bb7),
    // otherwise fall through to bb8 (exponent field zero, sign clear).
    masm.bind(bb0);
    masm.addl(gpr1, 16);
    masm.movl(gpr4, 32752);
    masm.andl(gpr4, gpr1);
    masm.cmpl(gpr4, 32752);
    masm.jcc(ConditionFlag.Equal, bb6);
    masm.testl(gpr1, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb7);

    // bb8: x has a zero exponent field; result sign flag is cleared. A value whose low and
    // high 32-bit halves are both zero goes to bb9, anything else is multiplied by the
    // constant with top word 18416 (0x47f0) and re-enters the main path at bb4.
    masm.bind(bb8);
    masm.movdqu(dest, temp10);
    masm.movdqu(temp3, temp10);
    masm.movdl(gpr4, temp3);
    masm.psrlq(temp3, 32);
    masm.movdl(gpr3, temp3);
    masm.orl(gpr4, gpr3);
    masm.cmpl(gpr4, 0);
    masm.jcc(ConditionFlag.Equal, bb9);
    masm.xorpd(temp3, temp3);
    masm.movl(gpr1, 18416);
    masm.pinsrw(temp3, gpr1, 3);
    masm.mulsd(dest, temp3);
    masm.xorpd(temp2, temp2);
    masm.movl(gpr1, 16368);
    masm.pinsrw(temp2, gpr1, 3);
    masm.movdqu(temp3, dest);
    masm.pextrw(gpr1, dest, 3);
    masm.por(dest, temp2);
    masm.movl(gpr3, 18416);
    masm.psrlq(dest, 27);
    masm.psrld(dest, 2);
    masm.rcpps(dest, dest);
    masm.psllq(temp3, 12);
    // 0x00000000, 0xfffff800, 0x00000000, 0xfffff800
    masm.movdqu(temp6, externalAddress(highSigMaskPtr));
    masm.psrlq(temp3, 12);
    masm.mulss(dest, temp7);
    masm.movl(gpr4, -1024);
    masm.movdl(temp5, gpr4);
    masm.por(temp3, temp1);
    masm.paddd(dest, temp4);
    masm.psllq(temp5, 32);
    masm.movdl(gpr4, dest);
    masm.psllq(dest, 29);
    masm.pand(temp5, temp3);
    masm.movl(gpr5, 0);
    masm.pand(dest, temp6);
    masm.subsd(temp3, temp5);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 18416);
    masm.sarl(gpr1, 4);
    masm.cvtsi2sdl(temp7, gpr1);
    masm.mulpd(temp5, dest);
    masm.jmp(bb4);

    // bb10: same sequence as bb8, but the result sign flag is set (entered from bb36).
    masm.bind(bb10);
    masm.movdqu(dest, temp10);
    masm.movdqu(temp3, temp10);
    masm.movdl(gpr4, temp3);
    masm.psrlq(temp3, 32);
    masm.movdl(gpr3, temp3);
    masm.orl(gpr4, gpr3);
    masm.cmpl(gpr4, 0);
    masm.jcc(ConditionFlag.Equal, bb9);
    masm.xorpd(temp3, temp3);
    masm.movl(gpr1, 18416);
    masm.pinsrw(temp3, gpr1, 3);
    masm.mulsd(dest, temp3);
    masm.xorpd(temp2, temp2);
    masm.movl(gpr1, 16368);
    masm.pinsrw(temp2, gpr1, 3);
    masm.movdqu(temp3, dest);
    masm.pextrw(gpr1, dest, 3);
    masm.por(dest, temp2);
    masm.movl(gpr3, 18416);
    masm.psrlq(dest, 27);
    masm.psrld(dest, 2);
    masm.rcpps(dest, dest);
    masm.psllq(temp3, 12);
    // 0x00000000, 0xfffff800, 0x00000000, 0xfffff800
    masm.movdqu(temp6, externalAddress(highSigMaskPtr));
    masm.psrlq(temp3, 12);
    masm.mulss(dest, temp7);
    masm.movl(gpr4, -1024);
    masm.movdl(temp5, gpr4);
    masm.por(temp3, temp1);
    masm.paddd(dest, temp4);
    masm.psllq(temp5, 32);
    masm.movdl(gpr4, dest);
    masm.psllq(dest, 29);
    masm.pand(temp5, temp3);
    masm.movl(gpr5, Integer.MIN_VALUE);
    masm.pand(dest, temp6);
    masm.subsd(temp3, temp5);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 18416);
    masm.sarl(gpr1, 4);
    masm.cvtsi2sdl(temp7, gpr1);
    masm.mulpd(temp5, dest);
    masm.jmp(bb4);

    // bb5: exponent estimate outside the fast range: negative -> bb11, too large -> bb12,
    // otherwise a more careful evaluation that also checks the final result for a zero
    // exponent field (bb13) or an all-ones exponent field (bb14).
    masm.bind(bb5);
    masm.cmpl(gpr1, 0);
    masm.jcc(ConditionFlag.Less, bb11);
    masm.cmpl(gpr1, 752);
    masm.jcc(ConditionFlag.AboveEqual, bb12);
    masm.addsd(dest, temp7);
    // 0xf8000000, 0xffffffff
    masm.movq(temp4, externalAddress(halfmaskPtr));
    masm.addpd(temp3, dest);
    masm.xorpd(temp6, temp6);
    masm.movl(gpr1, 17080);
    masm.pinsrw(temp6, gpr1, 3);
    masm.pshufd(dest, temp3, 0xEE);
    masm.addsd(dest, temp3);
    masm.movdqu(temp3, temp5);
    masm.addsd(temp5, dest);
    masm.subsd(temp3, temp5);
    masm.movdqu(temp7, temp5);
    masm.pand(temp5, temp4);
    masm.movdqu(temp2, temp1);
    masm.pand(temp4, temp1);
    masm.subsd(temp7, temp5);
    masm.addsd(dest, temp3);
    masm.subsd(temp1, temp4);
    masm.mulsd(temp4, temp5);
    masm.addsd(dest, temp7);
    masm.mulsd(temp2, dest);
    masm.movdqu(temp7, temp6);
    masm.mulsd(temp1, temp5);
    masm.addsd(temp6, temp4);
    masm.movdl(gpr1, temp6);
    masm.subsd(temp6, temp7);
    masm.leaq(gpr7, externalAddress(tExpPtr));
    masm.movl(gpr3, gpr1);
    masm.andl(gpr1, 255);
    masm.addl(gpr1, gpr1);
    masm.movdqu(temp5, new AMD64Address(gpr7, gpr1, Scale.Times8, 0));
    masm.addsd(temp2, temp1);
    masm.leaq(gpr8, externalAddress(eCoeffPtr));
    // 0xe78a6731, 0x3f55d87f, 0xd704a0c0, 0x3fac6b08
    masm.movdqu(temp7, new AMD64Address(gpr8, 0));
    // 0x6fba4e77, 0x3f83b2ab, 0xff82c58f, 0x3fcebfbd
    masm.movdqu(temp3, new AMD64Address(gpr8, 16));
    masm.subsd(temp4, temp6);
    masm.pextrw(gpr4, temp6, 3);
    masm.addsd(temp2, temp4);
    masm.sarl(gpr3, 8);
    masm.movl(gpr1, gpr3);
    masm.sarl(gpr3, 1);
    masm.subl(gpr1, gpr3);
    masm.shll(gpr3, 20);
    masm.xorl(gpr3, gpr5);
    masm.movdl(temp6, gpr3);
    // 0xfefa39ef, 0x3fe62e42
    masm.movq(temp1, new AMD64Address(gpr8, 32));
    masm.andl(gpr4, 32767);
    masm.cmpl(gpr4, 16529);
    masm.jcc(ConditionFlag.Above, bb12);
    masm.pshufd(dest, temp2, 0x44);
    masm.pshufd(temp4, temp2, 0x44);
    masm.mulpd(dest, dest);
    masm.mulpd(temp7, temp4);
    masm.pshufd(temp6, temp6, 0x11);
    masm.mulsd(temp1, temp2);
    masm.mulsd(dest, dest);
    masm.paddd(temp5, temp6);
    masm.addpd(temp3, temp7);
    masm.mulsd(temp1, temp5);
    masm.pshufd(temp6, temp5, 0xEE);
    masm.mulpd(dest, temp3);
    masm.addsd(temp1, temp6);
    masm.pshufd(temp3, dest, 0xEE);
    masm.mulsd(dest, temp5);
    masm.mulsd(temp3, temp5);
    masm.shll(gpr1, 4);
    masm.xorpd(temp4, temp4);
    masm.addl(gpr1, 16368);
    masm.pinsrw(temp4, gpr1, 3);
    masm.addsd(dest, temp1);
    masm.addsd(dest, temp3);
    masm.movdqu(temp1, dest);
    masm.addsd(dest, temp5);
    masm.mulsd(dest, temp4);
    masm.pextrw(gpr1, dest, 3);
    masm.andl(gpr1, 32752);
    masm.jcc(ConditionFlag.Equal, bb13);
    masm.cmpl(gpr1, 32752);
    masm.jcc(ConditionFlag.Equal, bb14);
    masm.jmp(bb56);

    // bb6: exponent field of x is all ones (x is an infinity or a NaN).
    masm.bind(bb6);
    masm.movdqu(temp1, temp8);
    masm.movdqu(dest, temp10);
    masm.movdqu(temp2, dest);
    masm.movdl(gpr1, temp2);
    masm.psrlq(temp2, 20);
    masm.movdl(gpr4, temp2);
    masm.orl(gpr1, gpr4);
    // Mantissa bits all zero -> x is an infinity.
    masm.jcc(ConditionFlag.Equal, bb15);
    // x is a NaN: only y == +-0 (all bits except the sign are zero) yields 1.0.
    masm.movdl(gpr1, temp1);
    masm.psrlq(temp1, 32);
    masm.movdl(gpr4, temp1);
    masm.movl(gpr3, gpr4);
    masm.addl(gpr4, gpr4);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.Equal, bb16);
    masm.addsd(dest, dest);
    masm.jmp(bb56);

    // bb16: result = 1.0 (top word 0x3ff0).
    masm.bind(bb16);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);

    // bb18: y is a NaN; adding it to dest propagates the NaN into the result.
    masm.bind(bb18);
    masm.addpd(dest, temp8);
    masm.jmp(bb56);

    // bb15: x is an infinity; temp1 holds y (loaded at bb6).
    masm.bind(bb15);
    masm.movdl(gpr1, temp1);
    masm.movdqu(temp2, temp1);
    masm.psrlq(temp1, 32);
    masm.movdl(gpr4, temp1);
    masm.movl(gpr3, gpr4);
    masm.addl(gpr4, gpr4);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.Equal, bb19);
    masm.pextrw(gpr1, temp2, 3);
    masm.andl(gpr1, 32752);
    masm.cmpl(gpr1, 32752);
    masm.jcc(ConditionFlag.NotEqual, bb20);
    masm.movdl(gpr1, temp2);
    masm.psrlq(temp2, 20);
    masm.movdl(gpr4, temp2);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.NotEqual, bb18);

    // bb20: x is an infinity and y is neither zero nor NaN. gpr3 = high 32 bits of y.
    masm.bind(bb20);
    masm.pextrw(gpr1, dest, 3);
    masm.testl(gpr1, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb21);
    // x = +Infinity: negative y gives +0.0 (bb22), otherwise the result is x itself.
    masm.testl(gpr3, Integer.MIN_VALUE);
    masm.jcc(ConditionFlag.NotZero, bb22);
    masm.jmp(bb56);

    masm.bind(bb23);
    masm.movdl(gpr1, temp8);
    masm.testl(gpr1, 1);
    masm.jcc(ConditionFlag.NotEqual, bb24);
    masm.testl(gpr1, 2);
    masm.jcc(ConditionFlag.NotEqual, bb25);
    masm.jmp(bb24);

    // bb21: x = -Infinity. Classify y by its biased exponent (gpr3) and, for mid-range
    // exponents, by adding and subtracting the constant with top word 17208 (0x4338) to find
    // out whether y is an odd integer (bb25) or not (bb24).
    masm.bind(bb21);
    masm.shrl(gpr3, 20);
    masm.andl(gpr3, 2047);
    masm.cmpl(gpr3, 1075);
    masm.jcc(ConditionFlag.Above, bb24);
    masm.jcc(ConditionFlag.Equal, bb26);
    masm.cmpl(gpr3, 1074);
    masm.jcc(ConditionFlag.Above, bb23);
    masm.cmpl(gpr3, 1023);
    masm.jcc(ConditionFlag.Below, bb24);
    masm.movdqu(temp1, temp8);
    masm.movl(gpr1, 17208);
    masm.xorpd(temp3, temp3);
    masm.pinsrw(temp3, gpr1, 3);
    masm.movdqu(temp4, temp3);
    masm.addsd(temp3, temp1);
    masm.subsd(temp4, temp3);
    masm.addsd(temp1, temp4);
    masm.pextrw(gpr1, temp1, 3);
    masm.andl(gpr1, 32752);
    masm.jcc(ConditionFlag.NotEqual, bb24);
    masm.movdl(gpr1, temp3);
    masm.andl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb24);

    // bb25: x = -Infinity, y odd integer: negative y -> -0.0 (bb27), else the result is x.
    masm.bind(bb25);
    masm.pextrw(gpr1, temp8, 3);
    masm.andl(gpr1, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb27);
    masm.jmp(bb56);

    // bb27: result = -0.0 (top word 0x8000).
    masm.bind(bb27);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 32768);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);

    // bb24: x = -Infinity, y not an odd integer: negative y -> +0.0 (bb22), else +Infinity.
    masm.bind(bb24);
    masm.pextrw(gpr1, temp8, 3);
    masm.andl(gpr1, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb22);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 32752);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);

    masm.bind(bb26);
    masm.movdl(gpr1, temp8);
    masm.andl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb24);
    masm.jmp(bb25);

    // bb28: negative x, exponent field of y all ones; temp1 holds y (loaded at bb7).
    masm.bind(bb28);
    masm.movdl(gpr1, temp1);
    masm.psrlq(temp1, 20);
    masm.movdl(gpr4, temp1);
    masm.orl(gpr1, gpr4);
    // Mantissa bits all zero -> y is an infinity; otherwise y is a NaN and is propagated.
    masm.jcc(ConditionFlag.Equal, bb29);
    masm.addsd(dest, temp8);
    masm.jmp(bb56);

    // bb29: negative x, infinite y. x == -1.0 (top word 0xbff0, remaining bits zero) gives NaN.
    masm.bind(bb29);
    masm.movdqu(dest, temp10);
    masm.pextrw(gpr1, dest, 3);
    masm.cmpl(gpr1, 49136);
    masm.jcc(ConditionFlag.NotEqual, bb30);
    masm.movdl(gpr3, dest);
    masm.psrlq(dest, 20);
    masm.movdl(gpr4, dest);
    masm.orl(gpr3, gpr4);
    masm.jcc(ConditionFlag.NotEqual, bb30);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 32760);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);

    // bb30: result is +0.0 or +Infinity (bb31), chosen from the sign of (exponent of x - bias)
    // xor-ed with the sign of y.
    masm.bind(bb30);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 16368);
    masm.pextrw(gpr4, temp8, 3);
    masm.xorpd(dest, dest);
    masm.xorl(gpr1, gpr4);
    masm.andl(gpr1, 32768);
    masm.jcc(ConditionFlag.Equal, bb31);
    masm.jmp(bb56);

    masm.bind(bb31);
    masm.movl(gpr3, 32752);
    masm.pinsrw(dest, gpr3, 3);
    masm.jmp(bb56);

    // bb32/bb33: negative x, large y; decide parity of y from its low mantissa bits
    // (temp1 holds y, loaded at bb7). Even -> bb35, odd -> bb36, non-integer -> bb34.
    masm.bind(bb32);
    masm.movdl(gpr1, temp1);
    masm.cmpl(gpr4, 17184);
    masm.jcc(ConditionFlag.Above, bb33);
    masm.testl(gpr1, 1);
    masm.jcc(ConditionFlag.NotEqual, bb34);
    masm.testl(gpr1, 2);
    masm.jcc(ConditionFlag.Equal, bb35);
    masm.jmp(bb36);

    masm.bind(bb33);
    masm.testl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb35);
    masm.jmp(bb36);

    // bb7: sign bit of x is set (and x is not an infinity/NaN).
    masm.bind(bb7);
    masm.movdqu(temp2, temp10);
    masm.movdl(gpr1, temp2);
    masm.psrlq(temp2, 31);
    masm.movdl(gpr3, temp2);
    masm.orl(gpr1, gpr3);
    // All bits of x except the sign are zero -> x is -0.0.
    masm.jcc(ConditionFlag.Equal, bb9);
    // temp1 still holds the main-path constant 0x77f0000000000000 here, but the integrality
    // and parity test below, bb28 and bb32 all inspect temp1 as y. Load y first; without
    // this every integral y is classified as even and the result sign is lost.
    masm.movdqu(temp1, temp8);
    masm.pextrw(gpr4, temp8, 3);
    masm.movdl(gpr1, temp8);
    masm.movdqu(temp2, temp8);
    masm.psrlq(temp2, 32);
    masm.movdl(gpr3, temp2);
    masm.addl(gpr3, gpr3);
    masm.orl(gpr3, gpr1);
    // y == +-0 -> result 1.0.
    masm.jcc(ConditionFlag.Equal, bb37);
    masm.andl(gpr4, 32752);
    masm.cmpl(gpr4, 32752);
    masm.jcc(ConditionFlag.Equal, bb28);
    masm.cmpl(gpr4, 17200);
    masm.jcc(ConditionFlag.Above, bb35);
    masm.cmpl(gpr4, 17184);
    masm.jcc(ConditionFlag.AboveEqual, bb32);
    masm.cmpl(gpr4, 16368);
    masm.jcc(ConditionFlag.Below, bb34);
    // Add and subtract the constant with top word 17208 (0x4338): temp1 ends up as the
    // fractional remainder of y and the low bit of temp2 as the parity of the rounded y.
    masm.movl(gpr1, 17208);
    masm.xorpd(temp2, temp2);
    masm.pinsrw(temp2, gpr1, 3);
    masm.movdqu(temp4, temp2);
    masm.addsd(temp2, temp1);
    masm.subsd(temp4, temp2);
    masm.addsd(temp1, temp4);
    masm.pextrw(gpr1, temp1, 3);
    masm.andl(gpr1, 32767);
    masm.jcc(ConditionFlag.NotEqual, bb34);
    masm.movdl(gpr1, temp2);
    masm.andl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb35);

    // bb36: negative x, odd integral y. Restore the main-path constants in temp1/temp4,
    // recompute gpr1/gpr3 from |x|, set the result sign flag and re-enter at bb1
    // (or at bb10 when |x| has a zero exponent field).
    masm.bind(bb36);
    masm.xorpd(temp1, temp1);
    masm.movl(gpr4, 30704);
    masm.pinsrw(temp1, gpr4, 3);
    masm.pextrw(gpr1, temp10, 3);
    masm.movl(gpr4, 8192);
    masm.movdl(temp4, gpr4);
    masm.andl(gpr1, 32767);
    masm.subl(gpr1, 16);
    masm.jcc(ConditionFlag.Less, bb10);
    masm.movl(gpr4, gpr1);
    masm.andl(gpr4, 32752);
    masm.subl(gpr4, 16368);
    masm.movl(gpr3, gpr4);
    masm.sarl(gpr4, 31);
    masm.addl(gpr3, gpr4);
    masm.xorl(gpr3, gpr4);
    masm.addl(gpr3, 16);
    masm.bsrl(gpr3, gpr3);
    masm.movl(gpr5, Integer.MIN_VALUE);
    masm.jmp(bb1);

    // bb34: negative x, non-integral y: 0.0 * Infinity produces the NaN result.
    masm.bind(bb34);
    masm.xorpd(temp1, temp1);
    masm.movl(gpr1, 32752);
    masm.pinsrw(temp1, gpr1, 3);
    masm.xorpd(dest, dest);
    masm.mulsd(dest, temp1);
    masm.jmp(bb56);

    // bb35: negative x, even integral y. Same as bb36 but the result sign flag is cleared
    // (re-enters at bb1, or at bb8 when |x| has a zero exponent field).
    masm.bind(bb35);
    masm.xorpd(temp1, temp1);
    masm.movl(gpr4, 30704);
    masm.pinsrw(temp1, gpr4, 3);
    masm.pextrw(gpr1, temp10, 3);
    masm.movl(gpr4, 8192);
    masm.movdl(temp4, gpr4);
    masm.andl(gpr1, 32767);
    masm.subl(gpr1, 16);
    masm.jcc(ConditionFlag.Less, bb8);
    masm.movl(gpr4, gpr1);
    masm.andl(gpr4, 32752);
    masm.subl(gpr4, 16368);
    masm.movl(gpr3, gpr4);
    masm.sarl(gpr4, 31);
    masm.addl(gpr3, gpr4);
    masm.xorl(gpr3, gpr4);
    masm.addl(gpr3, 16);
    masm.bsrl(gpr3, gpr3);
    masm.movl(gpr5, 0);
    masm.jmp(bb1);

    // bb19: result = 1.0.
    masm.bind(bb19);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);

    // bb22: result = +0.0.
    masm.bind(bb22);
    masm.xorpd(dest, dest);
    masm.jmp(bb56);

    // bb11: negative exponent estimate. Very small magnitudes return 1.0 (bb38); otherwise a
    // short evaluation around 1.0 with the sign taken from gpr5.
    masm.bind(bb11);
    masm.addl(gpr1, 384);
    masm.cmpl(gpr1, 0);
    masm.jcc(ConditionFlag.Less, bb38);
    masm.mulsd(temp5, temp1);
    masm.addsd(dest, temp7);
    masm.shrl(gpr5, 31);
    masm.addpd(temp3, dest);
    masm.pshufd(dest, temp3, 0xEE);
    masm.addsd(temp3, dest);
    // 0xfefa39ef, 0x3fe62e42, 0xfefa39ef, 0xbfe62e42
    masm.leaq(gpr7, externalAddress(logTwoPowPtr));
    // gpr5 is now 0 or 1 and selects one of the two quadwords of the table.
    masm.movq(temp4, new AMD64Address(gpr7, gpr5, Scale.Times8, 0));
    masm.mulsd(temp1, temp3);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.shll(gpr5, 15);
    masm.orl(gpr1, gpr5);
    masm.pinsrw(dest, gpr1, 3);
    masm.addsd(temp5, temp1);
    masm.mulsd(temp5, temp4);
    masm.addsd(dest, temp5);
    masm.jmp(bb56);

    // bb38/bb37: result = 1.0.
    masm.bind(bb38);
    masm.bind(bb37);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);

    // bb39: result = 1.0.
    masm.bind(bb39);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);

    // bb9: x is +0.0 or -0.0.
    masm.bind(bb9);
    // On every path into bb9 temp1 holds the main-path constant 0x77f0000000000000, while
    // bb40 inspects temp1 as y (zero test, exponent, sign). Load y first; otherwise bb39,
    // bb42, bb43 and the reciprocal in bb44 are unreachable and the result is always +0.0.
    masm.movdqu(temp1, temp8);
    masm.movdqu(temp2, temp8);
    masm.pextrw(gpr1, temp8, 3);
    masm.andl(gpr1, 32752);
    masm.cmpl(gpr1, 32752);
    masm.jcc(ConditionFlag.NotEqual, bb40);
    masm.movdl(gpr1, temp2);
    masm.psrlq(temp2, 20);
    masm.movdl(gpr4, temp2);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.NotEqual, bb18);

    // bb40: x is a zero and y is not a NaN. gpr3 keeps the high 32 bits of y (its sign).
    masm.bind(bb40);
    masm.movdl(gpr1, temp1);
    masm.psrlq(temp1, 32);
    masm.movdl(gpr4, temp1);
    masm.movl(gpr3, gpr4);
    masm.addl(gpr4, gpr4);
    masm.orl(gpr1, gpr4);
    // y == +-0 -> result 1.0.
    masm.jcc(ConditionFlag.Equal, bb39);
    masm.shrl(gpr4, 21);
    masm.cmpl(gpr4, 1075);
    masm.jcc(ConditionFlag.Above, bb41);
    masm.jcc(ConditionFlag.Equal, bb42);
    masm.cmpl(gpr4, 1023);
    masm.jcc(ConditionFlag.Below, bb41);
    masm.movdqu(temp1, temp8);
    masm.movl(gpr1, 17208);
    masm.xorpd(temp3, temp3);
    masm.pinsrw(temp3, gpr1, 3);
    masm.movdqu(temp4, temp3);
    masm.addsd(temp3, temp1);
    masm.subsd(temp4, temp3);
    masm.addsd(temp1, temp4);
    masm.pextrw(gpr1, temp1, 3);
    masm.andl(gpr1, 32752);
    masm.jcc(ConditionFlag.NotEqual, bb41);
    masm.movdl(gpr1, temp3);
    masm.andl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb41);

    // bb43: zero x, odd integral y: the result keeps the sign of x. Positive y returns x,
    // negative y returns 1.0 / x (bb44).
    masm.bind(bb43);
    masm.movdqu(dest, temp10);
    masm.testl(gpr3, Integer.MIN_VALUE);
    masm.jcc(ConditionFlag.NotEqual, bb44);
    masm.jmp(bb56);

    masm.bind(bb42);
    masm.movdl(gpr1, temp8);
    masm.testl(gpr1, 1);
    masm.jcc(ConditionFlag.NotEqual, bb43);

    // bb41: zero x, y not an odd integer: positive y -> +0.0 (bb22), negative y -> 1.0 / +0.0.
    masm.bind(bb41);
    masm.testl(gpr3, Integer.MIN_VALUE);
    masm.jcc(ConditionFlag.Equal, bb22);
    masm.xorpd(dest, dest);

    // bb44: result = 1.0 / dest.
    masm.bind(bb44);
    masm.movl(gpr1, 16368);
    masm.xorpd(temp1, temp1);
    masm.pinsrw(temp1, gpr1, 3);
    masm.divsd(temp1, dest);
    masm.movdqu(dest, temp1);
    masm.jmp(bb56);

    // bb12: result magnitude is out of range. Infinite/NaN y goes to bb45; otherwise choose
    // overflow (bb47) or underflow (bb46) from the sign of (exponent of x - bias) xor-ed with
    // the sign of y.
    masm.bind(bb12);
    masm.pextrw(gpr1, temp10, 3);
    masm.pextrw(gpr4, temp8, 3);
    masm.movl(gpr3, 32752);
    masm.andl(gpr3, gpr4);
    masm.cmpl(gpr3, 32752);
    masm.jcc(ConditionFlag.Equal, bb45);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 16368);
    masm.xorl(gpr4, gpr1);
    masm.testl(gpr4, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb46);

    // bb47: overflow. Multiply two values with top word 0x7fe0; the second one carries the
    // result sign from gpr5 (shifted down into bit 15 of the inserted word).
    masm.bind(bb47);
    masm.movl(gpr1, 32736);
    masm.pinsrw(dest, gpr1, 3);
    masm.shrl(gpr5, 16);
    masm.orl(gpr1, gpr5);
    masm.pinsrw(temp1, gpr1, 3);
    masm.mulsd(dest, temp1);

    masm.bind(bb14);
    masm.jmp(bb56);

    // bb46: underflow. Square a value with top word 0x0010, then apply the result sign.
    masm.bind(bb46);
    masm.movl(gpr1, 16);
    masm.pinsrw(dest, gpr1, 3);
    masm.mulsd(dest, dest);
    // The result sign lives in gpr5 (0 or Integer.MIN_VALUE), as used by bb47 above. gpr3 is
    // a masked exponent word or a pextrw result here, so its bit 31 is never set and testing
    // it would make the negation below dead code.
    masm.testl(gpr5, Integer.MIN_VALUE);
    masm.jcc(ConditionFlag.Equal, bb48);
    masm.movq(gpr2, 0x8000000000000000L);
    masm.movdq(temp2, gpr2);
    masm.xorpd(dest, temp2);

    masm.bind(bb48);
    masm.jmp(bb56);

    // bb13: the computed result has a zero exponent field; recombine the pieces with a mask
    // whose width depends on the exponents of temp5 and temp4.
    masm.bind(bb13);
    masm.pextrw(gpr3, temp5, 3);
    masm.pextrw(gpr4, temp4, 3);
    masm.movl(gpr1, -1);
    masm.andl(gpr3, 32752);
    masm.subl(gpr3, 16368);
    masm.andl(gpr4, 32752);
    masm.addl(gpr4, gpr3);
    masm.movl(gpr3, -31);
    masm.sarl(gpr4, 4);
    masm.subl(gpr3, gpr4);
    masm.jcc(ConditionFlag.LessEqual, bb49);
    masm.cmpl(gpr3, 20);
    masm.jcc(ConditionFlag.Above, bb50);
    masm.shll(gpr1);

    masm.bind(bb49);
    masm.movdl(dest, gpr1);
    masm.psllq(dest, 32);
    masm.pand(dest, temp5);
    masm.subsd(temp5, dest);
    masm.addsd(temp5, temp1);
    masm.mulsd(dest, temp4);
    masm.mulsd(temp5, temp4);
    masm.addsd(dest, temp5);

    masm.bind(bb50);
    masm.jmp(bb48);

    // bb2: taken from bb1 when gpr1 is 0 or 1 after the subtraction there. For small
    // exponents of y control returns to bb3; otherwise a longer evaluation follows.
    masm.bind(bb2);
    masm.pextrw(gpr3, temp8, 3);
    masm.movl(gpr4, Integer.MIN_VALUE);
    masm.movdl(temp1, gpr4);
    masm.xorpd(temp7, temp7);
    masm.paddd(dest, temp4);
    masm.movdl(gpr4, dest);
    masm.psllq(dest, 29);
    masm.paddq(temp1, temp3);
    masm.pand(temp5, temp1);
    masm.andl(gpr3, 32752);
    masm.cmpl(gpr3, 16560);
    masm.jcc(ConditionFlag.Less, bb3);
    masm.leaq(gpr7, externalAddress(lTblPowPtr));
    masm.leaq(gpr8, externalAddress(coeffHPtr));
    // 0x00000000, 0xbfd61a00, 0x00000000, 0xbf5dabe1
    masm.movdqu(temp4, new AMD64Address(gpr8, 0));
    masm.pand(dest, temp6);
    masm.subsd(temp3, temp5);
    masm.addl(gpr1, 16351);
    masm.shrl(gpr1, 4);
    masm.subl(gpr1, 1022);
    masm.cvtsi2sdl(temp7, gpr1);
    masm.mulpd(temp5, dest);
    masm.mulsd(temp3, dest);
    masm.subsd(temp5, temp9);
    masm.pshufd(temp1, temp4, 0xE);
    masm.pshufd(temp2, temp3, 0x44);
    masm.unpcklpd(temp5, temp3);
    masm.addsd(temp3, temp5);
    masm.andl(gpr4, 16760832);
    masm.shrl(gpr4, 10);
    masm.addpd(temp7, new AMD64Address(gpr7, gpr4, Scale.Times1, -3648));
    masm.movdqu(temp6, temp4);
    masm.mulsd(temp4, temp5);
    masm.movdqu(dest, temp1);
    masm.mulsd(dest, temp5);
    masm.mulsd(temp6, temp2);
    masm.mulsd(temp1, temp2);
    masm.movdqu(temp2, temp5);
    masm.mulsd(temp4, temp5);
    masm.addsd(temp5, dest);
    masm.movdqu(dest, temp7);
    masm.addsd(temp2, temp3);
    masm.addsd(temp7, temp5);
    masm.mulsd(temp6, temp2);
    masm.subsd(dest, temp7);
    masm.movdqu(temp2, temp7);
    masm.addsd(temp7, temp4);
    masm.addsd(dest, temp5);
    masm.subsd(temp2, temp7);
    masm.addsd(temp4, temp2);
    masm.pshufd(temp2, temp5, 0xEE);
    masm.movdqu(temp5, temp7);
    masm.addsd(temp7, temp2);
    masm.addsd(temp4, dest);
    masm.leaq(gpr8, externalAddress(coeffPowPtr));
    // 0x6dc96112, 0xbf836578, 0xee241472, 0xbf9b0301
    masm.movdqu(dest, new AMD64Address(gpr8, 0));
    masm.subsd(temp5, temp7);
    masm.addsd(temp6, temp4);
    masm.movdqu(temp4, temp7);
    masm.addsd(temp5, temp2);
    masm.addsd(temp7, temp1);
    // 0x486ececc, 0x3fc4635e, 0x161bb241, 0xbf5dabe1
    masm.movdqu(temp2, new AMD64Address(gpr8, 64));
    masm.subsd(temp4, temp7);
    masm.addsd(temp6, temp5);
    masm.addsd(temp4, temp1);
    masm.pshufd(temp5, temp7, 0xEE);
    masm.movapd(temp1, temp7);
    masm.addsd(temp7, temp5);
    masm.subsd(temp1, temp7);
    masm.addsd(temp1, temp5);
    // 0x9f95985a, 0xbfb528db, 0xf8b5787d, 0x3ef2531e
    masm.movdqu(temp5, new AMD64Address(gpr8, 80));
    masm.pshufd(temp3, temp3, 0x44);
    masm.addsd(temp6, temp4);
    masm.addsd(temp6, temp1);
    // 0x9f95985a, 0xbfb528db, 0xb3841d2a, 0xbfd619b6
    masm.movdqu(temp1, new AMD64Address(gpr8, 32));
    masm.mulpd(dest, temp3);
    masm.mulpd(temp2, temp3);
    masm.pshufd(temp4, temp3, 0x44);
    masm.mulpd(temp3, temp3);
    masm.addpd(dest, temp1);
    masm.addpd(temp5, temp2);
    masm.mulsd(temp4, temp3);
    // 0xf8000000, 0xffffffff
    masm.movq(temp2, externalAddress(highmaskLogXPtr));
    masm.mulpd(temp3, temp3);
    masm.movdqu(temp1, temp8);
    masm.pextrw(gpr3, temp8, 3);
    masm.mulpd(dest, temp4);
    masm.pextrw(gpr1, temp7, 3);
    masm.mulpd(temp5, temp4);
    masm.mulpd(dest, temp3);
    masm.leaq(gpr8, externalAddress(highmaskYPtr));
    // 0x00000000, 0xffffffff
    masm.movq(temp4, new AMD64Address(gpr8, 8));
    masm.pand(temp2, temp7);
    masm.addsd(temp5, temp6);
    masm.subsd(temp7, temp2);
    masm.addpd(temp5, dest);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 16368);
    masm.andl(gpr3, 32752);
    masm.cmpl(gpr3, 32752);
    masm.jcc(ConditionFlag.Equal, bb45);
    masm.addl(gpr3, gpr1);
    masm.cmpl(gpr3, 16576);
    masm.jcc(ConditionFlag.AboveEqual, bb51);
    masm.pshufd(dest, temp5, 0xEE);
    masm.pand(temp4, temp1);
    masm.movdqu(temp3, temp1);
    masm.addsd(temp5, dest);
    masm.subsd(temp1, temp4);
    masm.xorpd(temp6, temp6);
    masm.movl(gpr4, 17080);
    masm.pinsrw(temp6, gpr4, 3);
    masm.addsd(temp7, temp5);
    masm.mulsd(temp4, temp2);
    masm.mulsd(temp1, temp2);
    masm.movdqu(temp5, temp6);
    masm.mulsd(temp3, temp7);
    masm.addsd(temp6, temp4);
    masm.addsd(temp1, temp3);
    masm.leaq(gpr8, externalAddress(eCoeffPtr));
    // 0xe78a6731, 0x3f55d87f, 0xd704a0c0, 0x3fac6b08
    masm.movdqu(temp7, new AMD64Address(gpr8, 0));
    masm.movdl(gpr4, temp6);
    masm.subsd(temp6, temp5);
    masm.leaq(gpr7, externalAddress(tExpPtr));
    masm.movl(gpr3, gpr4);
    masm.andl(gpr4, 255);
    masm.addl(gpr4, gpr4);
    masm.movdqu(temp5, new AMD64Address(gpr7, gpr4, Scale.Times8, 0));
    // 0x6fba4e77, 0x3f83b2ab, 0xff82c58f, 0x3fcebfbd
    masm.movdqu(temp3, new AMD64Address(gpr8, 16));
    // 0xfefa39ef, 0x3fe62e42
    masm.movq(temp2, new AMD64Address(gpr8, 32));
    masm.subsd(temp4, temp6);
    masm.addsd(temp4, temp1);
    masm.pextrw(gpr4, temp6, 3);
    masm.shrl(gpr3, 8);
    masm.movl(gpr1, gpr3);
    masm.shrl(gpr3, 1);
    masm.subl(gpr1, gpr3);
    masm.shll(gpr3, 20);
    masm.movdl(temp6, gpr3);
    masm.pshufd(dest, temp4, 0x44);
    masm.pshufd(temp1, temp4, 0x44);
    masm.mulpd(dest, dest);
    masm.mulpd(temp7, temp1);
    masm.pshufd(temp6, temp6, 0x11);
    masm.mulsd(temp2, temp4);
    masm.andl(gpr4, 32767);
    masm.cmpl(gpr4, 16529);
    masm.jcc(ConditionFlag.Above, bb12);
    masm.mulsd(dest, dest);
    masm.paddd(temp5, temp6);
    masm.addpd(temp3, temp7);
    masm.mulsd(temp2, temp5);
    masm.pshufd(temp6, temp5, 0xEE);
    masm.mulpd(dest, temp3);
    masm.addsd(temp2, temp6);
    masm.pshufd(temp3, dest, 0xEE);
    masm.addl(gpr1, 1023);
    masm.shll(gpr1, 20);
    // Merge the result sign flag into the scaling factor.
    masm.orl(gpr1, gpr5);
    masm.movdl(temp4, gpr1);
    masm.mulsd(dest, temp5);
    masm.mulsd(temp3, temp5);
    masm.addsd(dest, temp2);
    masm.psllq(temp4, 32);
    masm.addsd(dest, temp3);
    masm.movdqu(temp1, dest);
    masm.addsd(dest, temp5);
    masm.mulsd(dest, temp4);
    masm.pextrw(gpr1, dest, 3);
    masm.andl(gpr1, 32752);
    masm.jcc(ConditionFlag.Equal, bb13);
    masm.cmpl(gpr1, 32752);
    masm.jcc(ConditionFlag.Equal, bb14);
    masm.jmp(bb56);

    // bb45: exponent field of y is all ones (infinite or NaN y). x + (-1.0) with a zero top
    // word means x is 1.0 (or within rounding of it): the result is NaN (top word 0x7ff8).
    masm.bind(bb45);
    masm.movdqu(dest, temp10);
    masm.xorpd(temp2, temp2);
    masm.movl(gpr1, 49136);
    masm.pinsrw(temp2, gpr1, 3);
    masm.addsd(temp2, dest);
    masm.pextrw(gpr1, temp2, 3);
    masm.cmpl(gpr1, 0);
    masm.jcc(ConditionFlag.NotEqual, bb53);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 32760);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);

    // bb53: NaN y is propagated (y + y); infinite y continues at bb54.
    masm.bind(bb53);
    masm.movdqu(temp1, temp8);
    masm.movdl(gpr4, temp1);
    masm.movdqu(temp3, temp1);
    masm.psrlq(temp3, 20);
    masm.movdl(gpr3, temp3);
    masm.orl(gpr3, gpr4);
    masm.jcc(ConditionFlag.Equal, bb54);
    masm.addsd(temp1, temp1);
    masm.movdqu(dest, temp1);
    masm.jmp(bb56);

    // bb51: choose overflow (bb47) or underflow (bb46) from the signs of temp1 and temp2.
    masm.bind(bb51);
    masm.pextrw(gpr1, temp1, 3);
    masm.pextrw(gpr3, temp2, 3);
    masm.xorl(gpr1, gpr3);
    masm.testl(gpr1, 32768);
    masm.jcc(ConditionFlag.Equal, bb47);
    masm.jmp(bb46);

    // bb54: infinite y. The result is +0.0, or +Infinity (bb55), chosen from the sign of
    // (exponent of x - bias) xor-ed with the sign of y.
    masm.bind(bb54);
    masm.pextrw(gpr1, dest, 3);
    masm.andl(gpr1, 32752);
    masm.pextrw(gpr4, temp1, 3);
    masm.xorpd(dest, dest);
    masm.subl(gpr1, 16368);
    masm.xorl(gpr1, gpr4);
    masm.testl(gpr1, 32768);
    masm.jcc(ConditionFlag.Equal, bb55);
    masm.jmp(bb56);

    masm.bind(bb55);
    masm.movl(gpr4, 32752);
    masm.pinsrw(dest, gpr4, 3);
    masm.jmp(bb56);

    // bb56: common exit; the result is in dest.
    masm.bind(bb56);
}
Also used : Register(jdk.vm.ci.code.Register) ValueUtil.asRegister(jdk.vm.ci.code.ValueUtil.asRegister) ArrayDataPointerConstant(org.graalvm.compiler.lir.asm.ArrayDataPointerConstant) Label(org.graalvm.compiler.asm.Label) AMD64Address(org.graalvm.compiler.asm.amd64.AMD64Address)

Example 32 with Label

use of org.graalvm.compiler.asm.Label in project graal by oracle.

the class AArch64ArrayEqualsOp method emitTailCompares.

/**
 * Emits the tail comparison: up to three optional steps of 4, 2 and 1 bytes, each guarded by
 * the matching bit of {@code result}. Which steps are emitted at all depends on the element
 * size of {@code kind}; a step narrower than one element is never generated.
 *
 * @param masm assembler the instructions are emitted into
 * @param result register whose low bits (4, 2, 1) select which steps run
 * @param array1 pointer into the first array; passed to the post-indexed loads of the 4- and
 *            2-byte steps
 * @param array2 pointer into the second array, used in the same way as {@code array1}
 * @param breakLabel branch target taken as soon as a step finds differing data
 * @param rscratch1 scratch register; holds the XOR of the loaded values and is zeroed at the end
 */
private void emitTailCompares(AArch64MacroAssembler masm, Register result, Register array1, Register array2, Label breakLabel, Register rscratch1) {
    final Label halfwordStep = new Label();
    final Label byteStep = new Label();
    final Label done = new Label();
    final Register scratch = asRegister(temp4);
    final int elementSize = kind.getByteCount();

    if (elementSize <= 4) {
        // 4-byte step, skipped at run time when bit value 4 of result is clear.
        masm.ands(32, zr, result, 4);
        masm.branchConditionally(ConditionFlag.EQ, halfwordStep);
        masm.ldr(32, scratch, AArch64Address.createPostIndexedImmediateAddress(array1, 4));
        masm.ldr(32, rscratch1, AArch64Address.createPostIndexedImmediateAddress(array2, 4));
        masm.eor(32, scratch == null ? rscratch1 : rscratch1, scratch, rscratch1);
        masm.cbnz(32, rscratch1, breakLabel);

        // The skip target is bound here whether or not a 2-byte step follows.
        masm.bind(halfwordStep);
        if (elementSize <= 2) {
            // 2-byte step, skipped at run time when bit value 2 of result is clear.
            masm.ands(32, zr, result, 2);
            masm.branchConditionally(ConditionFlag.EQ, byteStep);
            masm.ldr(16, scratch, AArch64Address.createPostIndexedImmediateAddress(array1, 2));
            masm.ldr(16, rscratch1, AArch64Address.createPostIndexedImmediateAddress(array2, 2));
            masm.eor(32, rscratch1, scratch, rscratch1);
            masm.cbnz(32, rscratch1, breakLabel);

            // Likewise bound unconditionally; the 1-byte step is only needed for
            // single-byte element kinds (boolean and byte arrays).
            masm.bind(byteStep);
            if (elementSize <= 1) {
                // 1-byte step, skipped at run time when bit value 1 of result is clear.
                masm.ands(32, zr, result, 1);
                masm.branchConditionally(ConditionFlag.EQ, done);
                masm.ldr(8, scratch, AArch64Address.createBaseRegisterOnlyAddress(array1));
                masm.ldr(8, rscratch1, AArch64Address.createBaseRegisterOnlyAddress(array2));
                masm.eor(32, rscratch1, scratch, rscratch1);
                masm.cbnz(32, rscratch1, breakLabel);
            }
        }
    }

    // Every step passed (or none was needed): leave zero in rscratch1.
    masm.bind(done);
    masm.mov(64, rscratch1, zr);
}
Also used : ScratchRegister(org.graalvm.compiler.asm.aarch64.AArch64MacroAssembler.ScratchRegister) Register(jdk.vm.ci.code.Register) ValueUtil.asRegister(jdk.vm.ci.code.ValueUtil.asRegister) Label(org.graalvm.compiler.asm.Label)

Example 33 with Label

use of org.graalvm.compiler.asm.Label in project graal by oracle.

the class AMD64ArrayEqualsOp method emit8ByteCompare.

/**
 * Emits the comparison that processes the arrays in {@code VECTOR_SIZE}-byte chunks loaded
 * into a general-purpose temporary.
 * <p>
 * Layout of the emitted code: a main loop over all full chunks, then one extra load that
 * covers the leftover bytes by reading the last {@code VECTOR_SIZE} bytes of the compared
 * range. For floating-point element kinds a chunk mismatch is not final; it is re-checked
 * element by element through {@code emitFloatCompare}. When the input holds no full chunk,
 * the code jumps to the end of this sequence with the tail byte count copied into
 * {@code length}, and continues with whatever is emitted next.
 *
 * @param crb supplies the target description used to align the loop head
 * @param masm assembler the instructions are emitted into
 * @param result on entry a byte count; reduced to the tail byte count
 * @param array1 pointer to the first array's data; advanced past the full chunks
 * @param array2 pointer to the second array's data; advanced past the full chunks
 * @param length on entry a byte count; reduced to the full-chunk byte count and then negated
 *            to serve as the loop index
 * @param trueLabel branch target for "arrays are equal"
 * @param falseLabel branch target for "arrays differ"
 */
private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
    final Label chunkLoop = new Label();
    final Label tailOnly = new Label();
    final boolean floatElements = kind.isNumericFloat();
    final Label advance = new Label();
    final Label nanSlowPath = new Label();
    final Register chunk = asRegister(temp4);
    // A raw mismatch is conclusive only for non-float kinds.
    final Label onChunkMismatch = floatElements ? nanSlowPath : falseLabel;

    // result := number of tail bytes, length := number of bytes in full chunks.
    masm.andl(result, VECTOR_SIZE - 1);
    masm.andl(length, ~(VECTOR_SIZE - 1));
    masm.jcc(ConditionFlag.Zero, tailOnly);

    // Point both arrays just past the full chunks and count length up from -bytes to zero.
    masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
    masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
    masm.negq(length);

    // Main loop, with an aligned head.
    masm.align(crb.target.wordSize * 2);
    masm.bind(chunkLoop);
    masm.movq(chunk, new AMD64Address(array1, length, Scale.Times1, 0));
    masm.cmpq(chunk, new AMD64Address(array2, length, Scale.Times1, 0));
    masm.jcc(ConditionFlag.NotEqual, onChunkMismatch);
    masm.bind(advance);
    masm.addq(length, VECTOR_SIZE);
    masm.jccb(ConditionFlag.NotZero, chunkLoop);

    // All full chunks matched; with no tail bytes the arrays are equal.
    masm.testl(result, result);
    masm.jcc(ConditionFlag.Zero, trueLabel);

    if (floatElements) {
        // The per-element re-check is a slow path, so it lives outside the loop and the
        // fall-through path hops over it.
        Label tailCompare = new Label();
        masm.jmpb(tailCompare);
        masm.bind(nanSlowPath);
        // This Java loop runs while emitting code: one element compare per element in a chunk.
        for (int byteOffset = 0; byteOffset < VECTOR_SIZE; byteOffset += kind.getByteCount()) {
            emitFloatCompare(masm, array1, array2, length, byteOffset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
        }
        masm.jmpb(advance);
        masm.bind(tailCompare);
    }

    /*
     * Tail: a single unaligned load positioned so that it ends exactly where the compared
     * range ends.
     */
    masm.movq(chunk, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
    masm.cmpq(chunk, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
    if (!floatElements) {
        masm.jccb(ConditionFlag.NotEqual, falseLabel);
    } else {
        masm.jcc(ConditionFlag.Equal, trueLabel);
        // Same emit-time unrolling as in the loop's slow path, applied to the tail chunk.
        for (int byteOffset = 0; byteOffset < VECTOR_SIZE; byteOffset += kind.getByteCount()) {
            emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + byteOffset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
        }
    }
    masm.jmpb(trueLabel);

    // Entry point for inputs without a full chunk: hand the tail byte count over in length.
    masm.bind(tailOnly);
    masm.movl(length, result);
}
Also used : Register(jdk.vm.ci.code.Register) ValueUtil.asRegister(jdk.vm.ci.code.ValueUtil.asRegister) Label(org.graalvm.compiler.asm.Label) AMD64Address(org.graalvm.compiler.asm.amd64.AMD64Address)

Example 34 with Label

use of org.graalvm.compiler.asm.Label in project graal by oracle.

the class AMD64ArrayEqualsOp method emitAVXCompare.

/**
 * Emits the comparison that processes the arrays in {@code AVX_VECTOR_SIZE}-byte chunks using
 * AVX vector loads, XOR and test instructions. The target must support AVX2.
 * <p>
 * Layout of the emitted code: a main loop over all full chunks, then one extra pair of loads
 * that covers the leftover bytes by reading the last {@code AVX_VECTOR_SIZE} bytes of the
 * compared range. For floating-point element kinds a chunk mismatch is not final; it is
 * re-checked through {@code emitFloatCompareWithinRange}. When the input holds no full chunk,
 * the code jumps to the end of this sequence with the tail byte count copied into
 * {@code length}, and continues with whatever is emitted next.
 *
 * @param crb supplies the target description (AVX2 check, loop alignment)
 * @param masm assembler the instructions are emitted into
 * @param result on entry a byte count; reduced to the tail byte count
 * @param array1 pointer to the first array's data; advanced past the full chunks
 * @param array2 pointer to the second array's data; advanced past the full chunks
 * @param length on entry a byte count; reduced to the full-chunk byte count and then negated
 *            to serve as the loop index
 * @param trueLabel branch target for "arrays are equal"
 * @param falseLabel branch target for "arrays differ"
 */
private void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
    assert supportsAVX2(crb.target);
    final Register lhsVector = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
    final Register rhsVector = asRegister(vectorTemp2, AMD64Kind.DOUBLE);
    final Label chunkLoop = new Label();
    final Label tailOnly = new Label();
    final boolean floatElements = kind.isNumericFloat();
    final Label advance = new Label();
    final Label nanSlowPath = new Label();
    // A raw mismatch is conclusive only for non-float kinds.
    final Label onChunkMismatch = floatElements ? nanSlowPath : falseLabel;

    // Compare AVX_VECTOR_SIZE-byte vectors.
    // result := number of tail bytes, length := number of bytes in full chunks.
    masm.andl(result, AVX_VECTOR_SIZE - 1);
    masm.andl(length, ~(AVX_VECTOR_SIZE - 1));
    masm.jcc(ConditionFlag.Zero, tailOnly);

    // Point both arrays just past the full chunks and count length up from -bytes to zero.
    masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
    masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
    masm.negq(length);

    // Main loop, with an aligned head. XOR of the two chunks is all-zero iff they match.
    masm.align(crb.target.wordSize * 2);
    masm.bind(chunkLoop);
    masm.vmovdqu(lhsVector, new AMD64Address(array1, length, Scale.Times1, 0));
    masm.vmovdqu(rhsVector, new AMD64Address(array2, length, Scale.Times1, 0));
    masm.vpxor(lhsVector, lhsVector, rhsVector);
    masm.vptest(lhsVector, lhsVector);
    masm.jcc(ConditionFlag.NotZero, onChunkMismatch);
    masm.bind(advance);
    masm.addq(length, AVX_VECTOR_SIZE);
    masm.jcc(ConditionFlag.NotZero, chunkLoop);

    // All full chunks matched; with no tail bytes the arrays are equal.
    masm.testl(result, result);
    masm.jcc(ConditionFlag.Zero, trueLabel);

    if (floatElements) {
        // The re-check of a mismatching chunk sits outside the loop; the fall-through path
        // hops over it.
        Label tailCompare = new Label();
        masm.jmpb(tailCompare);
        masm.bind(nanSlowPath);
        emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, AVX_VECTOR_SIZE);
        masm.jmpb(advance);
        masm.bind(tailCompare);
    }

    /*
     * Tail: a single pair of unaligned loads positioned so that they end exactly where the
     * compared range ends.
     */
    masm.vmovdqu(lhsVector, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
    masm.vmovdqu(rhsVector, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
    masm.vpxor(lhsVector, lhsVector, rhsVector);
    masm.vptest(lhsVector, lhsVector);
    if (!floatElements) {
        masm.jcc(ConditionFlag.NotZero, falseLabel);
    } else {
        masm.jcc(ConditionFlag.Zero, trueLabel);
        emitFloatCompareWithinRange(crb, masm, array1, array2, result, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
    }
    masm.jmp(trueLabel);

    // Entry point for inputs without a full chunk: hand the tail byte count over in length.
    masm.bind(tailOnly);
    masm.movl(length, result);
}
Also used : Register(jdk.vm.ci.code.Register) ValueUtil.asRegister(jdk.vm.ci.code.ValueUtil.asRegister) Label(org.graalvm.compiler.asm.Label) AMD64Address(org.graalvm.compiler.asm.amd64.AMD64Address)

Aggregations

Label (org.graalvm.compiler.asm.Label)34 Register (jdk.vm.ci.code.Register)27 ValueUtil.asRegister (jdk.vm.ci.code.ValueUtil.asRegister)23 AMD64Address (org.graalvm.compiler.asm.amd64.AMD64Address)15 ArrayDataPointerConstant (org.graalvm.compiler.lir.asm.ArrayDataPointerConstant)7 ScratchRegister (org.graalvm.compiler.asm.aarch64.AArch64MacroAssembler.ScratchRegister)5 SPARCAddress (org.graalvm.compiler.asm.sparc.SPARCAddress)4 RegisterConfig (jdk.vm.ci.code.RegisterConfig)3 CompressEncoding (org.graalvm.compiler.core.common.CompressEncoding)3 FrameMap (org.graalvm.compiler.lir.framemap.FrameMap)3 TruffleCallBoundaryInstrumentation (org.graalvm.compiler.truffle.compiler.hotspot.TruffleCallBoundaryInstrumentation)3 AArch64MacroAssembler (org.graalvm.compiler.asm.aarch64.AArch64MacroAssembler)2 AMD64MacroAssembler (org.graalvm.compiler.asm.amd64.AMD64MacroAssembler)2 SPARCMacroAssembler (org.graalvm.compiler.asm.sparc.SPARCMacroAssembler)2 ScratchRegister (org.graalvm.compiler.asm.sparc.SPARCMacroAssembler.ScratchRegister)2 CallingConvention (jdk.vm.ci.code.CallingConvention)1 ValueUtil.isRegister (jdk.vm.ci.code.ValueUtil.isRegister)1 AArch64HotSpotRegisterConfig (jdk.vm.ci.hotspot.aarch64.AArch64HotSpotRegisterConfig)1 AArch64Address (org.graalvm.compiler.asm.aarch64.AArch64Address)1 Scale (org.graalvm.compiler.asm.amd64.AMD64Address.Scale)1